leonardklin committed on
Commit
978fed5
·
verified ·
1 Parent(s): 4d50c13

Upload 328 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +31 -0
  2. .env.template +64 -0
  3. .gitattributes +35 -0
  4. .gitignore +231 -0
  5. .gitmodules +9 -0
  6. .pre-commit-config.yaml +22 -0
  7. .python-version +1 -0
  8. .scider/SCIDER.md +11 -0
  9. .scider/skills/content-refinement-agent/SKILL.md +256 -0
  10. .scider/skills/content-refinement-agent/references/halt-rules.md +125 -0
  11. .scider/skills/content-refinement-agent/references/prompt.md +136 -0
  12. .scider/skills/content-refinement-agent/references/reviewer-rubric.md +131 -0
  13. .scider/skills/content-refinement-agent/references/safe-revision-rules.md +129 -0
  14. .scider/skills/content-refinement-agent/scripts/apply_worklog.py +94 -0
  15. .scider/skills/content-refinement-agent/scripts/score_delta.py +164 -0
  16. .scider/skills/content-refinement-agent/scripts/snapshot.py +47 -0
  17. .scider/skills/exploratory-data-analysis/SKILL.md +442 -0
  18. .scider/skills/exploratory-data-analysis/assets/report_template.md +196 -0
  19. .scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md +664 -0
  20. .scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md +664 -0
  21. .scider/skills/exploratory-data-analysis/references/general_scientific_formats.md +518 -0
  22. .scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md +620 -0
  23. .scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md +517 -0
  24. .scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md +633 -0
  25. .scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py +548 -0
  26. .scider/skills/literature-review-agent/SKILL.md +357 -0
  27. .scider/skills/literature-review-agent/references/citation-density-rule.md +71 -0
  28. .scider/skills/literature-review-agent/references/discovery-pipeline.md +151 -0
  29. .scider/skills/literature-review-agent/references/exa-search-cookbook.md +245 -0
  30. .scider/skills/literature-review-agent/references/prompt.md +77 -0
  31. .scider/skills/literature-review-agent/references/s2-api-cookbook.md +138 -0
  32. .scider/skills/literature-review-agent/references/verification-rules.md +100 -0
  33. .scider/skills/literature-review-agent/scripts/bibtex_format.py +211 -0
  34. .scider/skills/literature-review-agent/scripts/check_cutoff.py +63 -0
  35. .scider/skills/literature-review-agent/scripts/citation_coverage.py +104 -0
  36. .scider/skills/literature-review-agent/scripts/dedupe_by_id.py +98 -0
  37. .scider/skills/literature-review-agent/scripts/exa_search.py +169 -0
  38. .scider/skills/literature-review-agent/scripts/levenshtein_match.py +73 -0
  39. .scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py +156 -0
  40. .scider/skills/literature-review-agent/scripts/s2_cache.py +113 -0
  41. .scider/skills/literature-review-agent/scripts/s2_search.py +208 -0
  42. .scider/skills/literature-review-agent/scripts/sync_keys.py +119 -0
  43. .scider/skills/literature-review-agent/scripts/validate_pool.py +145 -0
  44. .scider/skills/matplotlib/SKILL.md +356 -0
  45. .scider/skills/matplotlib/references/api_reference.md +412 -0
  46. .scider/skills/matplotlib/references/common_issues.md +563 -0
  47. .scider/skills/matplotlib/references/plot_types.md +476 -0
  48. .scider/skills/matplotlib/references/styling_guide.md +589 -0
  49. .scider/skills/matplotlib/scripts/plot_template.py +446 -0
  50. .scider/skills/matplotlib/scripts/style_configurator.py +413 -0
.dockerignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ .env
12
+ .venv/
13
+ venv/
14
+ ENV/
15
+ env/
16
+ *.log
17
+ workspace/
18
+ .pytest_cache/
19
+ .coverage
20
+ htmlcov/
21
+ .DS_Store
22
+ *.swp
23
+ *.swo
24
+ *~
25
+ .git/
26
+ .github/
27
+ .claude/
28
+ benchmarks/
29
+ tmp_*
30
+ rsync_tmp_*
31
+ *.ipynb
.env.template ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- SciDER ---
2
+
3
+ # Provide any combination of provider keys. SciDER's unified model catalog
4
+ # (model_settings/catalog.yaml) lets you mix-and-match providers per role —
5
+ # e.g. ideation on Gemini, experiment_coding on GPT-5. Models whose key is
6
+ # missing are simply marked unavailable in the frontend.
7
+ OPENAI_API_KEY=...
8
+ GEMINI_API_KEY=...
9
+ ANTHROPIC_API_KEY=...
10
+ # Optional: Semantic Scholar API key for better rate limits (https://www.semanticscholar.org/product/api)
11
+ # S2_API_KEY=...
12
+
13
+
14
+
15
+ ## User Approval
16
+ # Set to true to enable interactive user approval at critical agent steps
17
+ USER_APPROVAL_ENABLED=true
18
+
19
+ ## HuggingFace Dataset Download
20
+ # Set to true to allow using HuggingFace repo names as data paths
21
+ HF_DATASET_DOWNLOAD_ENABLED=false
22
+ # HF_DATASET_CACHE_DIR=tmp_hf_datasets
23
+ # Maximum dataset size in MB (default 100)
24
+ # HF_DATASET_MAX_SIZE_MB=100
25
+
26
+ ## Logging
27
+ # LOGURU_LEVEL=INFO
28
+ LOGURU_LEVEL=DEBUG
29
+ LOG_SYSTEM_PROMPT=false
30
+
31
+ ## Coding Agent Switch
32
+ # choice: claude_sdk (default), native, openhands (requires SCIDER_ENABLE_OPENHANDS=1)
33
+ # - claude_sdk: Claude Agent SDK (requires ANTHROPIC_API_KEY)
34
+ # - native: SciDER's built-in coding agent (uses experiment_coding model, any LiteLLM provider)
35
+ # - openhands: OpenHands sandbox (requires SCIDER_ENABLE_OPENHANDS=1)
36
+ # legacy aliases: v3 = claude_sdk, v2 = openhands
37
+ CODING_AGENT_VERSION=claude_sdk
38
+ # choice: See https://platform.claude.com/docs/en/about-claude/models/overview
39
+ CLAUDE_SDK_MODEL=claude-haiku-4-5
40
+
41
+ ## Openhands
42
+ SCIDER_ENABLE_OPENHANDS=false
43
+ OPENHANDS_MODEL=gemini/gemini-2.5-flash
44
+ OPENHANDS_API_KEY=...
45
+
46
+ ## Context Compression Pipeline (runs in query() before each LLM call)
47
+ # Level 1: Persist oversized tool results to disk
48
+ COMPACT_TOOL_RESULT_MAX_CHARS=50000
49
+ # COMPACT_TOOL_RESULT_PREVIEW_CHARS=2000
50
+ # Level 2: Snip old tool results (keep N most recent)
51
+ COMPACT_SNIP_KEEP_RECENT=5
52
+ # Level 3: LLM-based autocompact (trigger threshold in tokens)
53
+ COMPACT_AUTOCOMPACT_TOKEN_THRESHOLD=256000
54
+ COMPACT_AUTOCOMPACT_MODEL=history
55
+ # COMPACT_AUTOCOMPACT_KEEP_RATIO=0.4
56
+ # COMPACT_AUTOCOMPACT_KEEP_FIRST_N=4
57
+
58
+ ## Permissions
59
+ # Path to tool permission overrides (JSON file)
60
+ # SCIDER_PERMISSIONS_FILE=.claude/permissions.json
61
+
62
+ ## Memory System (file-based cross-session memory in .scider/memory/)
63
+ # SCIDER_MEMORY_READ=true # Load memory index into agent context (default: true)
64
+ # SCIDER_MEMORY_WRITE=true # Allow agents to write new memories (default: true)
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # temporary files
210
+ tmp_*
211
+ rsync_tmp_*
212
+ .aider*
213
+ data_analysis.md
214
+ software-agent-sdk
215
+ env
216
+
217
+ streamlit-client/case-study-memory/
218
+ saved_chats/
219
+
220
+ # vibe coding
221
+ .claude/
222
+ .agents/
223
+ .windsurf/
224
+
225
+ # Ignore .scider/ contents but allow specific entries to be tracked
226
+ .scider/*
227
+ !.scider/skills/
228
+ !.scider/rules/
229
+ !.scider/SCIDER.md
230
+
231
+ workspace/
.gitmodules ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [submodule "benchmarks/mlebench/mle-bench"]
2
+ path = benchmarks/mlebench/mle-bench
3
+ url = git@github.com:leonardodalinky/mle-bench.git
4
+ [submodule "benchmarks/scicodebench/SciCode"]
5
+ path = benchmarks/scicodebench/SciCode
6
+ url = git@github.com:leonardodalinky/SciCode.git
7
+ [submodule "benchmarks/aiideabench/AI_Idea_Bench"]
8
+ path = benchmarks/aiideabench/AI_Idea_Bench
9
+ url = git@github.com:leonardodalinky/AI_Idea_Bench_2025.git
.pre-commit-config.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: name-tests-test
8
+ - id: requirements-txt-fixer
9
+ - repo: https://github.com/pycqa/isort
10
+ rev: 5.13.2
11
+ hooks:
12
+ - id: isort
13
+ args: ["--profile", "black", "--line-length=100", "--python-version=310"]
14
+ - repo: https://github.com/psf/black
15
+ rev: 25.1.0
16
+ hooks:
17
+ - id: black
18
+ args: ["--line-length=100", "--target-version=py310"]
19
+ - repo: https://github.com/kynan/nbstripout
20
+ rev: 0.8.2
21
+ hooks:
22
+ - id: nbstripout
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
.scider/SCIDER.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SCIDER.md
2
+
3
+ ## Approach
4
+ - Think before acting. Read existing files before writing code.
5
+ - Be concise in output but thorough in reasoning.
6
+ - Prefer editing over rewriting whole files.
7
+ - Do not re-read files you have already read unless the file may have changed.
8
+ - Test your code before declaring done.
9
+ - No sycophantic openers or closing fluff.
10
+ - Keep solutions simple and direct.
11
+ - User instructions always override this file.
.scider/skills/content-refinement-agent/SKILL.md ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: content-refinement-agent
3
+ description: Step 5 of the PaperOrchestra pipeline (arXiv:2604.05018). Iteratively refine drafts/paper.tex by simulating peer review and applying targeted revisions, with strict accept/revert halt rules. Maintains a worklog and snapshots each iteration so revert is real, not symbolic. TRIGGER when the orchestrator delegates Step 5 or when the user asks to "refine the draft", "iterate on the paper", or "run peer review on this paper".
4
+ allowed_agents: [writing]
5
+ ---
6
+
7
+ # Content Refinement Agent (Step 5)
8
+
9
+ Faithful implementation of the Content Refinement Agent from PaperOrchestra
10
+ (Song et al., 2026, arXiv:2604.05018, §4 Step 5, App. F.1 pp. 49–51).
11
+
12
+ **Cost: ~5–7 LLM calls** (App. B), typically ~3 refinement iterations, each
13
+ consisting of one reviewer call and one revision call.
14
+
15
+ The paper highlights this step as one of the largest contributors to overall
16
+ quality: refinement alone accounts for +19% (CVPR) and +22% (ICLR) absolute
17
+ acceptance-rate improvement (Fig. 4). Get this step right.
18
+
19
+ ## Inputs
20
+
21
+ - `workspace/drafts/paper.tex` — output of Step 4
22
+ - `workspace/inputs/conference_guidelines.md`
23
+ - `workspace/inputs/experimental_log.md` — used as ground truth for the
24
+ hallucination check
25
+ - `workspace/citation_pool.json` / `workspace/refs.bib` — the allowed
26
+ bibliography
27
+
28
+ ## Outputs
29
+
30
+ - `workspace/refinement/iter1/`, `iter2/`, `iter3/` — per-iteration snapshots
31
+ containing `paper.tex`, `paper.pdf`, `review.json`, `score.json`
32
+ - `workspace/refinement/worklog.json` — append-only history of decisions
33
+ - `workspace/final/paper.tex` and `workspace/final/paper.pdf` — copy of the
34
+ best accepted snapshot
35
+
36
+ ## The refinement loop
37
+
38
+ ```
39
+ prev_score = score(paper.tex) # baseline from initial draft
40
+ snapshot iter0/
41
+
42
+ for iter in 1..ITER_CAP (default 3):
43
+ 1. simulate_review(paper.tex) → review.json
44
+ (uses `references/reviewer-rubric.md` rubric)
45
+
46
+ 2. apply_revision(paper.tex, review.json) → new_paper.tex
47
+ (uses verbatim Refinement Agent prompt at `references/prompt.md`)
48
+
49
+ 3. snapshot iter<N>/ with new_paper.tex, review.json
50
+ latexmk -pdf new_paper.tex → iter<N>/paper.pdf
51
+
52
+ 4. score(new_paper.tex) → curr_score
53
+
54
+ 5. decide via score_delta.py:
55
+ - if curr.overall > prev.overall: ACCEPT
56
+ - elif curr.overall == prev.overall and net_subaxis ≥0: ACCEPT
57
+ - else: REVERT
58
+
59
+ 6. apply_worklog.py to append the decision
60
+
61
+ 7. if REVERT or HALT_PLATEAU or no actionable weaknesses or iter == ITER_CAP: HALT
62
+
63
+ paper.tex ← new_paper.tex (only on ACCEPT)
64
+ prev_score ← curr_score
65
+
66
+ cp <best iter>/paper.tex → workspace/final/paper.tex
67
+ ```
68
+
69
+ The "best" snapshot at HALT is the one with the highest accepted overall
70
+ score. On a REVERT halt, the best is the iteration immediately before the
71
+ revert.
72
+
73
+ ## Step-by-step
74
+
75
+ ### 0. Snapshot the initial draft
76
+
77
+ ```bash
78
+ python skills/content-refinement-agent/scripts/snapshot.py \
79
+ --src workspace/drafts/paper.tex \
80
+ --dst workspace/refinement/iter0/
81
+ ```
82
+
83
+ This creates `iter0/paper.tex`. Then compile to `iter0/paper.pdf`:
84
+
85
+ ```bash
86
+ cd workspace/refinement/iter0/ && latexmk -pdf -interaction=nonstopmode paper.tex
87
+ ```
88
+
89
+ Score it (see Steps 1–2 below: the simulated reviewer call produces the per-axis scores) → `iter0/score.json`.
90
+
91
+ ### 1. Simulate peer review
92
+
93
+ For each iteration N starting from 1:
94
+
95
+ Load `references/reviewer-rubric.md` as the system prompt for the simulated
96
+ reviewer call. The reviewer reads `iter<N-1>/paper.pdf` (or `paper.tex` if
97
+ your host LLM lacks PDF input) and produces a JSON of strengths,
98
+ weaknesses, questions, and per-axis scores.
99
+
100
+ The rubric is structured to mimic AgentReview (Jin et al., 2024) — the
101
+ paper's chosen evaluator. We ship a faithful rubric in the references
102
+ directory; the host agent's LLM does the actual reviewing.
103
+
104
+ Save to `workspace/refinement/iter<N>/review.json`.
105
+
106
+ ### 2. Score the draft
107
+
108
+ The reviewer call produces both qualitative feedback and a per-axis score:
109
+
110
+ ```json
111
+ {
112
+ "axis_scores": {
113
+ "scientific_depth": {"score": 65, "justification": "..."},
114
+ "technical_execution": {"score": 70, "justification": "..."},
115
+ "logical_flow": {"score": 60, "justification": "..."},
116
+ "writing_clarity": {"score": 55, "justification": "..."},
117
+ "evidence_presentation":{"score": 72, "justification": "..."},
118
+ "academic_style": {"score": 68, "justification": "..."}
119
+ },
120
+ "overall_score": 64.5,
121
+ "strengths": [...],
122
+ "weaknesses": [...],
123
+ "questions": [...]
124
+ }
125
+ ```
126
+
127
+ Save to `iter<N>/score.json`. (Combined with `review.json` if your host
128
+ emits one document; the schemas overlap.)
129
+
130
+ ### 3. Apply revision
131
+
132
+ Load the **verbatim Content Refinement Agent prompt** at `references/prompt.md`.
133
+ Prepend the Anti-Leakage Prompt. Inputs:
134
+
135
+ - `paper.tex` — current draft
136
+ - `paper.pdf` — compiled PDF (multimodal context if available)
137
+ - `conference_guidelines.md`
138
+ - `experimental_log.md` — ground truth for numeric claims
139
+ - `worklog.json` — history of previous changes
140
+ - `citation_pool.json` — the allowed bibliography
141
+ - `reviewer_feedback` — the JSON from Step 1
142
+
143
+ The prompt instructs the model to address weaknesses, integrate question
144
+ answers, and emit two output blocks:
145
+
146
+ 1. A worklog JSON `{addressed_weaknesses[], integrated_answers[], actions_taken[]}`
147
+ 2. The full revised LaTeX code
148
+
149
+ Save the revised LaTeX as `iter<N>/paper.tex`. Append the worklog JSON to
150
+ `workspace/refinement/worklog.json` via `apply_worklog.py`.
151
+
152
+ ### 4. Compile and re-score
153
+
154
+ ```bash
155
+ cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
156
+ ```
157
+
158
+ Then re-run the simulated review on the new draft → updated `score.json`
159
+ for the new iteration. (This is the "re-score after revision" call.)
160
+
161
+ ### 5. Apply the accept/revert decision
162
+
163
+ The calling loop must track `CONSECUTIVE_SMALL` (starts at 0) and pass it
164
+ on each call so `score_delta.py` can detect the plateau:
165
+
166
+ ```bash
167
+ python skills/content-refinement-agent/scripts/score_delta.py \
168
+ --prev workspace/refinement/iter<N-1>/score.json \
169
+ --curr workspace/refinement/iter<N>/score.json \
170
+ --plateau-threshold 1.0 \
171
+ --plateau-streak 3 \
172
+ --consecutive-small $CONSECUTIVE_SMALL \
173
+ > workspace/refinement/iter<N>/delta.json
174
+
175
+ EXIT=$?
176
+ # Update streak for next iteration:
177
+ CONSECUTIVE_SMALL=$(python3 -c "
178
+ import json
179
+ d = json.load(open('workspace/refinement/iter<N>/delta.json'))
180
+ print(d['consecutive_small'])
181
+ ")
182
+ ```
183
+
184
+ Exit codes:
185
+ - `0` — ACCEPT (overall improved or tied with non-negative net sub-axis, no plateau)
186
+ - `1` — REVERT (overall decreased)
187
+ - `2` — REVERT (tied overall, but net sub-axis change negative)
188
+ - `4` — HALT_PLATEAU (accepted but N consecutive iterations below threshold — stop early)
189
+
190
+ Behavior:
191
+
192
+ - **ACCEPT (exit 0)**: keep `iter<N>/paper.tex` as the new best. Continue to iter N+1.
193
+ - **REVERT (exit 1 or 2)**: copy `iter<N-1>/paper.tex` back as canonical, halt.
194
+ - **HALT_PLATEAU (exit 4)**: keep current (it was accepted), but stop — further
195
+ iterations are unlikely to yield meaningful gains. In practice ~85% of
196
+ refinement gain comes in iteration 1; the plateau fires when subsequent
197
+ iterations improve by less than 1 point for 3 consecutive rounds.
198
+
199
+ Always log the decision via `apply_worklog.py --decision ...`.
200
+
201
+ ### 6. Halt rules
202
+
203
+ Halt the loop when ANY of these is true:
204
+
205
+ 1. Iteration count reaches `ITER_CAP` (default 3).
206
+ 2. `score_delta.py` returned exit code 1 or 2 (REVERT).
207
+ 3. The simulated reviewer's `weaknesses` list is empty (no actionable
208
+ feedback to apply).
209
+ 4. `score_delta.py` returned exit code 4 (HALT_PLATEAU — plateau early-stop).
210
+
211
+ ### 7. Promote the best snapshot
212
+
213
+ Identify the iteration with the highest accepted `overall_score` (this may
214
+ be the latest accepted iteration, OR an earlier one if a later iteration
215
+ was reverted). Copy:
216
+
217
+ ```bash
218
+ cp workspace/refinement/iter<best>/paper.tex workspace/final/paper.tex
219
+ cp workspace/refinement/iter<best>/paper.pdf workspace/final/paper.pdf
220
+ ```
221
+
222
+ Then in the final report, tell the user:
223
+ - How many iterations were run
224
+ - The final overall score
225
+ - The score trajectory (e.g., "iter0 64.5 → iter1 67.3 (accept) → iter2 69.1 (accept) → iter3 68.9 (revert, halt)")
226
+ - Which iteration was promoted
227
+
228
+ ## Critical safety constraints (App. F.1, pages 50–51)
229
+
230
+ The paper explicitly notes that early versions of the Refinement Agent
231
+ "exploited the automated reviewer's scoring function by superficially
232
+ listing missing baselines as limitations to artificially inflate
233
+ acceptance scores." The verbatim prompt forbids this. **You must honor it:**
234
+
235
+ - **Ignore reviewer requests for new experiments, ablations, or baselines.**
236
+ The Refinement Agent's job is presentation, not new science. If the
237
+ reviewer asks for missing data, simply skip those points — do NOT add
238
+ fabricated experiments, do NOT add a "future work" item promising them.
239
+ - **Never explicitly state a limitation.** The phrase "we acknowledge as a
240
+ limitation that..." is forbidden. The model can address weaknesses
241
+ through clearer explanation, but must not game the evaluator by listing
242
+ them defensively.
243
+ - **All numeric claims MUST be verified against `experimental_log.md`.**
244
+ The agent cannot introduce new numbers, only re-present existing ones.
245
+
246
+ These rules prevent reward hacking and keep the refinement loop honest.
247
+
248
+ ## Resources
249
+
250
+ - `references/prompt.md` — verbatim Content Refinement Agent prompt from App. F.1
251
+ - `references/reviewer-rubric.md` — AgentReview-style scoring rubric (6 axes)
252
+ - `references/halt-rules.md` — accept/revert/halt logic in formal pseudocode
253
+ - `references/safe-revision-rules.md` — anti-reward-hack constraints
254
+ - `scripts/score_delta.py` — accept/revert decision from two score JSONs
255
+ - `scripts/apply_worklog.py` — append iteration entries to worklog.json
256
+ - `scripts/snapshot.py` — copy paper.tex/paper.pdf into iter<N>/ for rollback
.scider/skills/content-refinement-agent/references/halt-rules.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Halt Rules
2
+
3
+ Source: arXiv:2604.05018, §4 Step 5 ("Iterative Content Refinement"):
4
+
5
+ > After modifying the LaTeX source to address weaknesses, revisions are
6
+ > accepted if the overall score increases, or if it ties when net sub-axis
7
+ > gains are non-negative. The agent immediately reverts to the previous
8
+ > version and halts upon any overall score decrease, negative tie-breaker,
9
+ > or reaching the iteration limit.
10
+
11
+ Encoded as deterministic logic in `scripts/score_delta.py`. This file is the
12
+ human-readable specification.
13
+
14
+ ## Definitions
15
+
16
+ Let:
17
+
18
+ - `prev` = score JSON from the previous accepted iteration
19
+ - `curr` = score JSON from the just-completed iteration
20
+ - `prev.overall` = `prev.overall_score`
21
+ - `curr.overall` = `curr.overall_score`
22
+ - `subaxis_delta(axis)` = `curr.axis_scores[axis].score - prev.axis_scores[axis].score`
23
+ - `net_subaxis_delta` = `sum(subaxis_delta(a) for a in 6 axes)`
24
+
25
+ ## Decision rules (in order)
26
+
27
+ ```
28
+ if curr.overall > prev.overall:
29
+ DECISION = ACCEPT_IMPROVED
30
+
31
+ elif curr.overall == prev.overall:
32
+ if net_subaxis_delta >= 0:
33
+ DECISION = ACCEPT_TIED_NON_NEGATIVE
34
+ else:
35
+ DECISION = REVERT_TIED_NEGATIVE_SUBAXIS
36
+
37
+ else: # curr.overall < prev.overall
38
+ DECISION = REVERT_OVERALL_DECREASED
39
+ ```
40
+
41
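+ The script exits with one of the following codes (a further exit code, 4 = HALT_PLATEAU, is described under "Loop-level halt conditions" below):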
42
+
43
+ | Exit code | Meaning | Loop action |
44
+ |---|---|---|
45
+ | 0 | ACCEPT_IMPROVED | keep new draft, continue loop |
46
+ | 0 | ACCEPT_TIED_NON_NEGATIVE | keep new draft, continue loop |
47
+ | 1 | REVERT_OVERALL_DECREASED | rollback to prev, halt loop |
48
+ | 2 | REVERT_TIED_NEGATIVE_SUBAXIS | rollback to prev, halt loop |
49
+
50
+ The script also prints a one-line decision string and a JSON object on
51
+ stdout for the host agent to log.
52
+
53
+ ## Loop-level halt conditions
54
+
55
+ In addition to the per-iteration accept/revert decision, the loop halts
56
+ when ANY of these is true:
57
+
58
+ 1. **Iteration cap reached.** Default 3 (configurable via env var
59
+ `PO_REFINE_MAX_ITER`). Per the paper Table 7, the typical
60
+ refinement count is "3× content refinement loop".
61
+ 2. **REVERT decision** from `score_delta.py` (exit code 1 or 2).
62
+ 3. **Empty weaknesses list.** If the simulated reviewer's `weaknesses`
63
+ array is empty, there is nothing to fix — halt.
64
+ 4. **Plateau early-stop (exit code 4).** `score_delta.py` returns
65
+ `HALT_PLATEAU` when `N` consecutive accepted iterations each have
66
+ `overall_delta < threshold`. Default: threshold=1.0 points, N=3.
67
+ Configurable via `--plateau-threshold` and `--plateau-streak`.
68
+
69
+ The calling loop must pass `--consecutive-small <count>` to
70
+ `score_delta.py` to track the streak across iterations:
71
+
72
+ ```bash
73
+ CONSECUTIVE_SMALL=0
74
+ for iter in 1 2 3 ...; do
75
+ # ... run refinement LLM call ...
76
+ python score_delta.py \
77
+ --prev iter$((iter-1))/score.json \
78
+ --curr iter${iter}/score.json \
79
+ --plateau-threshold 1.0 \
80
+ --plateau-streak 3 \
81
+ --consecutive-small $CONSECUTIVE_SMALL > iter${iter}/delta.json
82
+ EXIT=$?
83
+ # Update streak counter from script output
84
+ CONSECUTIVE_SMALL=$(python -c "import json,sys; \
85
+ d=json.loads(open('iter${iter}/delta.json').read()); \
86
+ print(d['consecutive_small'])")
87
+ if [ $EXIT -ne 0 ]; then break; fi
88
+ done
89
+ ```
90
+
91
+ **Why this matters**: in practice, ~85% of the refinement gain comes
92
+ in the first iteration (scores jump 5-8 points). Subsequent iterations
93
+ typically improve by <1 point. Without early-stop, the loop runs all 3
94
+ iterations (each a reviewer call plus a revision call) even when iterations 2 and 3 contribute near-zero value.
95
+
96
+ ## Promoting the best snapshot
97
+
98
+ After halt, identify the iteration with the highest `accepted` overall
99
+ score:
100
+
101
+ ```python
102
+ accepted_iters = [it for it in worklog.iterations if it.decision.startswith("ACCEPT")]
103
+ best = max(accepted_iters, key=lambda it: it.score.overall_score)
104
+ ```
105
+
106
+ If the loop halted on REVERT, `best` is the iteration immediately *before*
107
+ the reverted one. Copy its `paper.tex` and `paper.pdf` to
108
+ `workspace/final/`.
109
+
110
+ ## Worked example
111
+
112
+ Suppose:
113
+
114
+ | iter | overall | depth | exec | flow | clarity | evidence | style | decision |
115
+ |---|---|---|---|---|---|---|---|---|
116
+ | 0 | 64.5 | 65 | 70 | 60 | 55 | 72 | 68 | (baseline) |
117
+ | 1 | 67.3 | 68 | 73 | 64 | 58 | 74 | 70 | ACCEPT_IMPROVED |
118
+ | 2 | 67.3 | 70 | 73 | 64 | 58 | 73 | 71 | ACCEPT_TIED_NON_NEGATIVE (Σdelta = +2) |
119
+ | 3 | 66.0 | 70 | 70 | 62 | 56 | 73 | 71 | REVERT_OVERALL_DECREASED, HALT |
120
+
121
+ Promoted: iter 2 (`final/paper.tex` ← `iter2/paper.tex`).
122
+ Score trajectory in the run report:
123
+ ```
124
+ 64.5 → 67.3 (accept) → 67.3 (accept tied) → 66.0 (revert, halt)
125
+ ```
.scider/skills/content-refinement-agent/references/prompt.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Content Refinement Agent — verbatim prompt
2
+
3
+ **Source: arXiv:2604.05018, Appendix F.1, pages 49–51 (verbatim).**
4
+
5
+ This is the exact prompt used by the Content Refinement Agent in the paper.
6
+ Use it as your system message when applying a revision. The Anti-Leakage
7
+ Prompt (`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be
8
+ prepended.
9
+
10
+ ---
11
+
12
+ ```
13
+ Role: Senior AI Researcher.
14
+
15
+ Task: Revise and strengthen a LaTeX research paper by systematically
16
+ addressing peer review feedback.
17
+
18
+ You are the author responsible for the "Rebuttal via Revision" phase. You
19
+ will receive:
20
+ - paper.tex: The current LaTeX source code.
21
+ - paper.pdf: The compiled PDF context.
22
+ - conference_guidelines.md: The formatting and page limit rules.
23
+ - experimental_log.md: The Ground Truth for all data and metrics.
24
+ - worklog.json: History of previous changes.
25
+ - citation_map.json: The allowed bibliography.
26
+ - reviewer_feedback: A JSON object containing specific Strengths,
27
+ Weaknesses, Questions, and Decisions from an LLM reviewer.
28
+
29
+ Your Goal
30
+
31
+ 1. Analyze Feedback: Deconstruct the reviewer_feedback into actionable
32
+ editing tasks.
33
+ 2. Address Weaknesses: Rewrite sections to clarify logic, strengthen
34
+ arguments, or justify design choices pointed out as weak.
35
+ 3. Integrate Answers: Incorporate answers to the reviewer's "Questions"
36
+ directly into the manuscript (e.g., adding training cost details to
37
+ the Implementation section).
38
+ 4. Execution: Generate a JSON worklog of your editorial decisions and the
39
+ full, revised LaTeX source.
40
+
41
+ Critical Execution Standards
42
+
43
+ 1. Content Revision Strategy
44
+ - Weakness Mitigation: If the reviewer flags "incremental novelty",
45
+ rewrite the Introduction and Related Work to explicitly contrast
46
+ your contribution against prior art. If they flag "unclear
47
+ methodology", restructure the relevant section for clarity.
48
+ - Answering Questions: Do NOT write a separate response letter. If the
49
+ reviewer asks "What is the inference latency?", you must find a
50
+ natural place in the paper (e.g., Experiments or Discussion) to
51
+ insert that information, ensuring it aligns with experimental_log.md.
52
+ - Preserve Strengths: Do not delete or heavily alter sections listed
53
+ under "Strengths" unless necessary for space or flow.
54
+
55
+ 2. Data Integrity & Hallucination Check
56
+ - Ground Truth: All numerical claims (accuracy, parameter count,
57
+ training hours, latency) MUST be verified against
58
+ experimental_log.md.
59
+ - Missing Data: If the reviewer asks for new experiments, ablations, or
60
+ baselines that are NOT in experimental_log.md, simply ignore those
61
+ specific requests. Your job is purely presentation refinement of the
62
+ existing completed experiments, not adding or promising to add new
63
+ experiments.
64
+
65
+ 3. Writing Style & Tone
66
+ - Academic Tone: Maintain a formal, objective, and precise tone. Avoid
67
+ defensive language.
68
+ - Conciseness: If the paper is near the page limit, prioritize density
69
+ of information over flowery prose.
70
+ - Flow: Ensure that new insertions (answers to questions) transition
71
+ smoothly with existing text.
72
+
73
+ 4. LaTeX & Citation Integrity
74
+ - Structure: Do not break the LaTeX compilation. Keep packages and
75
+ environments stable. If using figure* for wide figures, ensure they
76
+ are closed with \end{{figure*}} (not \end{{figure}}). Check for
77
+ completeness.
78
+ - Citations: Use ONLY keys from citation_map.json.
79
+
80
+ Output Format (Strict)
81
+
82
+ You MUST return your response in two distinct code blocks in this exact
83
+ order:
84
+
85
+ 1. Worklog for the current turn (JSON):
86
+ {{
87
+ "addressed_weaknesses": [
88
+ "Clarified contribution novelty in Intro (Reviewer point 2)",
89
+ "Added justification for two-stage training (Reviewer point 1)"
90
+ ],
91
+ "integrated_answers": [
92
+ "Added training cost (45 GPU hours) to Implementation Details",
93
+ "Added epsilon hyperparameter explanation to Method section"
94
+ ],
95
+ "actions_taken": [
96
+ "Rewrote Section 3.2 for clarity",
97
+ "Inserted new paragraph in Section 5.1 regarding latency"
98
+ ]
99
+ }}
100
+
101
+ 2. The FULL revised LaTeX code:
102
+ ```latex
103
+ ... Full revised LaTeX code here ...
104
+ ```
105
+
106
+ Important Notes
107
+
108
+ - Completeness: Always provide the FULL LaTeX code. Do not return diffs
109
+ or partial snippets.
110
+ - Responsiveness: Every question in the reviewer_feedback must be
111
+ addressed by improving the presentation, EXCEPT for questions asking
112
+ for new experiments or data not in experimental_log.md (which should
113
+ be ignored). Never explicitly state a limitation.
114
+ - Safety: Do not remove the \documentclass or essential preamble.
115
+ ```
116
+
117
+ ---
118
+
119
+ ## Why "never explicitly state a limitation" is a hard rule
120
+
121
+ From App. F.1 p.51, the paper explains:
122
+
123
+ > We explicitly instruct the Content Refinement Agent to ignore reviewer
124
+ > requests for additional experiments. This constraint is crucial to
125
+ > prevent the agent from generating fabricated results or making false
126
+ > promises within the paper... Furthermore, the directive to "never
127
+ > explicitly state a limitation" prevents reward hacking. During early
128
+ > testing, the agent exploited the automated reviewer's scoring function
129
+ > by superficially listing missing baselines as limitations to
130
+ > artificially inflate acceptance scores. Banning this behavior from the
131
+ > refinement loop forces the agent to genuinely improve the manuscript's
132
+ > presentation and clarity rather than gamifying the evaluation metric.
133
+
134
+ `safe-revision-rules.md` formalizes this as a deterministic gate the host
135
+ agent should run after each revision: grep the new draft for the substring
136
+ `limitation` (case-insensitive), ignoring LaTeX comments, and reject if found.
.scider/skills/content-refinement-agent/references/reviewer-rubric.md ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reviewer Rubric (AgentReview-style)
2
+
3
+ The Content Refinement Agent loop needs a simulated reviewer that produces
4
+ **structured, scoreable** feedback the host agent can compare iteration to
5
+ iteration. The paper uses AgentReview (Jin et al., 2024) as its evaluator
6
+ in §5 (App. F.1 references "AgentReview" by name and uses its output schema:
7
+ "strengths, weaknesses, questions, decisions").
8
+
9
+ This document defines a faithful AgentReview-style reviewer prompt to use
10
+ under any host LLM. Use it as the system message for the simulated review
11
+ call before each refinement iteration.
12
+
13
+ ---
14
+
15
+ ## System prompt for the simulated reviewer
16
+
17
+ ```
18
+ You are an expert academic peer reviewer for a top-tier machine learning
19
+ conference (CVPR, ICLR, NeurIPS, ICML). Read the provided LaTeX paper or
20
+ PDF and produce a rigorous, structured review.
21
+
22
+ Your review must be CONSERVATIVE. High scores are rare and must be
23
+ explicitly justified with concrete evidence from the paper. Assume most
24
+ drafts are not publication-ready.
25
+
26
+ You MUST score the paper on six axes (0-100 each):
27
+
28
+ 1. Scientific Depth & Soundness
29
+ - Are the theoretical foundations and experimental setups rigorous?
30
+ - Are claims justified and free of unsupported leaps?
31
+
32
+ 2. Technical Execution
33
+ - Within the bounds of the described idea, is the methodology
34
+ implemented innovatively and effectively?
35
+ - Are the design choices justified by the experimental results?
36
+
37
+ 3. Logical Flow
38
+ - Do sections transition smoothly from Abstract through Conclusion?
39
+ - Are subsections structured logically with clear signposting?
40
+
41
+ 4. Writing Clarity
42
+ - Is the prose precise, concise, and free of repetitive phrasing?
43
+ - Are technical terms defined before use?
44
+
45
+ 5. Evidence Presentation
46
+ - Are figures, tables, and results integrated and referenced cleanly?
47
+ - Do visuals support the text claims directly?
48
+
49
+ 6. Academic Style
50
+ - Polished, professional academic tone?
51
+ - Consistent terminology throughout?
52
+
53
+ For each axis, provide a score AND a 2-5 sentence evidence-based
54
+ justification quoting concrete passages or pointing to specific failings.
55
+
56
+ Then identify:
57
+
58
+ - Strengths: 3-5 bullet points naming things the paper does well.
59
+ - Weaknesses: 3-5 bullet points naming concrete, fixable issues.
60
+ - Questions: 2-4 specific questions the paper should answer for a
61
+ reader to be convinced.
62
+ - Decision: one of "Strong Accept", "Accept", "Borderline", "Reject",
63
+ "Strong Reject".
64
+ - Overall Score: weighted average 0-100. Use:
65
+ overall = 0.20*depth + 0.20*execution + 0.15*flow
66
+ + 0.15*clarity + 0.20*evidence + 0.10*style
67
+
68
+ Output STRICT JSON only. No prose outside the JSON.
69
+ ```
70
+
71
+ ## Output JSON schema
72
+
73
+ ```json
74
+ {
75
+ "axis_scores": {
76
+ "scientific_depth": {
77
+ "score": 65,
78
+ "justification": "Loss formulation is grounded in the cited prior work but the ablation on the audio-visual fusion layer is small (n=3 seeds) and the variance bands overlap, making the claim of necessity weak. Section 3.2 introduces the cached memory without proving its necessity vs. simple pooling."
79
+ },
80
+ "technical_execution": { "score": 70, "justification": "..." },
81
+ "logical_flow": { "score": 60, "justification": "..." },
82
+ "writing_clarity": { "score": 55, "justification": "..." },
83
+ "evidence_presentation": { "score": 72, "justification": "..." },
84
+ "academic_style": { "score": 68, "justification": "..." }
85
+ },
86
+ "strengths": [
87
+ "Clear problem statement in the Introduction with three concrete failure cases of prior SAM-based methods.",
88
+ "Well-organized Related Work that contrasts the three competing paradigms.",
89
+ "..."
90
+ ],
91
+ "weaknesses": [
92
+ "The ablation in Table 2 lacks confidence intervals; 0.4 J-index gaps may not be significant.",
93
+ "Section 3.4 introduces the IoU loss term λ without justifying λ=1.0 vs other values.",
94
+ "Figure 3 is referenced once and never discussed in the prose.",
95
+ "..."
96
+ ],
97
+ "questions": [
98
+ "What is the inference latency on a single A100?",
99
+ "How does the temporal branch behave on videos longer than the training distribution?"
100
+ ],
101
+ "decision": "Borderline",
102
+ "overall_score": 65.45
103
+ }
104
+ ```
105
+
106
+ ## How the loop uses this output
107
+
108
+ The `score_delta.py` script reads two consecutive score JSONs and applies
109
+ the halt rules. The `apply_worklog.py` script appends a timestamped entry
110
+ to `workspace/refinement/worklog.json`. The Content Refinement Agent's
111
+ revision call takes the full `review.json` as `reviewer_feedback` input.
112
+
113
+ ## Anti-inflation guardrails
114
+
115
+ To prevent the simulated reviewer from being gameable, the rubric has hard
116
+ caps drawn from the paper's Literature Review Quality autorater
117
+ (App. F.3 — see also `paper-autoraters/references/litreview-quality-prompt.md`):
118
+
119
+ | Axis | Hard cap |
120
+ |---|---|
121
+ | Scientific Depth | ≤60 if claims are unsupported by experiments |
122
+ | Technical Execution | ≤55 if methodology section omits key implementation details |
123
+ | Logical Flow | ≤60 if sections don't reference the figures/tables they need |
124
+ | Writing Clarity | ≤60 if repetitive phrasing or undefined acronyms |
125
+ | Evidence Presentation | ≤55 if any figure is unreferenced from the text |
126
+ | Academic Style | ≤55 if defensive language is present |
127
+
128
+ These caps are not part of the system prompt text above; append them to it to keep the reviewer honest.
129
+ The Content Refinement Agent's "never explicitly state a limitation" rule
130
+ combined with these caps closes the reward-hacking loop the paper observed
131
+ in early testing (App. F.1 p.51).
.scider/skills/content-refinement-agent/references/safe-revision-rules.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Safe Revision Rules
2
+
3
+ The Content Refinement Agent prompt (App. F.1 p.50–51) imposes two
4
+ anti-reward-hacking constraints. Both must be enforced not just by the
5
+ prompt but by deterministic post-revision gates, because LLMs occasionally
6
+ forget instructions buried in long prompts.
7
+
8
+ ## Rule 1 — Ignore reviewer requests for new experiments
9
+
10
+ The simulated reviewer will sometimes ask:
11
+
12
+ - "What if you ablated the temperature parameter?"
13
+ - "How does this compare to baseline X?"
14
+ - "Have you tried this on dataset Y?"
15
+
16
+ The Refinement Agent must **not** fabricate answers to these. The paper:
17
+
18
+ > If the reviewer asks for new experiments, ablations, or baselines that
19
+ > are NOT in experimental_log.md, simply ignore those specific requests.
20
+ > Your job is purely presentation refinement of the existing completed
21
+ > experiments, not adding or promising to add new experiments.
22
+
23
+ ### Enforcement
24
+
25
+ There is no fully deterministic way to grep for "fabricated experiments" —
26
+ it requires reading the new content and cross-checking against
27
+ `experimental_log.md`. The pragmatic check:
28
+
29
+ 1. Run the orphan-citation gate from `section-writing-agent/scripts/orphan_cite_gate.py`.
30
+ New numeric claims often come bundled with new (orphan) citations.
31
+ 2. Run a numeric-claim grep: extract every `\d+\.\d+%?` from the new draft,
32
+ intersect with `\d+\.\d+%?` in `experimental_log.md`. New numbers in the
33
+ draft that aren't in the log are suspicious. (False positives possible
34
+ for parameter counts and dates; review manually.)
35
+
36
+ The orchestrator should re-prompt the refinement step if either gate fires
37
+ with new fabricated claims.
38
+
39
+ ## Rule 2 — Never explicitly state a limitation
40
+
41
+ The paper:
42
+
43
+ > The directive to "never explicitly state a limitation" prevents reward
44
+ > hacking. During early testing, the agent exploited the automated
45
+ > reviewer's scoring function by superficially listing missing baselines
46
+ > as limitations to artificially inflate acceptance scores.
47
+
48
+ ### Enforcement (deterministic)
49
+
50
+ Grep the revised draft for the substring `limitation` (case-insensitive),
51
+ excluding LaTeX comments. If found anywhere in the body, reject the
52
+ revision and re-prompt:
53
+
54
+ ```bash
55
+ # pseudocode — implement inline in the host agent
56
+ grep -in -E '\blimitation' workspace/refinement/iter<N>/paper.tex \
57
+ | grep -v -E '^[0-9]+:\s*%'
58
+ ```
59
+
60
+ Allowed contexts (these are NOT violations):
61
+
62
+ - LaTeX comments: `% address the limitation of ...`
63
+ - Citation context: a paper title containing "limitation" cited in
64
+ `\cite{...}`. The grep should ignore the inside of `\cite{...}` braces.
65
+ - Quoted prior-work descriptions: "Smith et al. acknowledge the
66
+ limitation..." — context-dependent. The simplest rule is "no instances
67
+ of the word 'limitation' in the running prose at all", and let the host
68
+ agent handle edge cases by re-prompting if a legitimate use is needed.
69
+
70
+ This is a strict rule. The Refinement Agent should rewrite "we acknowledge
71
+ the limitation that our method..." as "our method assumes..." or "the
72
+ proposed approach is most effective when...". Reframing, not listing.
73
+
74
+ ## Rule 3 — Numeric ground truth
75
+
76
+ > All numerical claims (accuracy, parameter count, training hours,
77
+ > latency) MUST be verified against experimental_log.md.
78
+
79
+ The grep heuristic above catches this partially. The host agent should
80
+ also instruct the refinement step explicitly: "any numeric value you cite
81
+ in your revision must already exist in experimental_log.md or
82
+ metrics.json."
83
+
84
+ ## Rule 4 — Citation integrity
85
+
86
+ The orphan-citation gate from
87
+ `section-writing-agent/scripts/orphan_cite_gate.py` must pass after every
88
+ refinement iteration. Re-run it as part of the post-revision checks:
89
+
90
+ ```bash
91
+ python skills/section-writing-agent/scripts/orphan_cite_gate.py \
92
+ workspace/refinement/iter<N>/paper.tex \
93
+ workspace/refs.bib
94
+ ```
95
+
96
+ If the refinement step introduced a new `\cite{KEY}` not in `refs.bib`,
97
+ revert the iteration and re-prompt with an explicit instruction to use
98
+ only existing keys.
99
+
100
+ ## Rule 5 — LaTeX integrity
101
+
102
+ Re-run `latex_sanity.py` and `latexmk -pdf` after every revision. If the
103
+ revision broke the build, revert.
104
+
105
+ ## Summary checklist for each refinement iteration
106
+
107
+ ```bash
108
+ # 1. apply revision → iter<N>/paper.tex
109
+ # 2. compile
110
+ cd workspace/refinement/iter<N>/ && latexmk -pdf -interaction=nonstopmode paper.tex
111
+
112
+ # 3. structural sanity
113
+ python skills/section-writing-agent/scripts/latex_sanity.py paper.tex || REVERT
114
+ python skills/section-writing-agent/scripts/orphan_cite_gate.py paper.tex ../../refs.bib || REVERT
115
+
116
+ # 4. anti-leakage
117
+ python skills/paper-orchestra/scripts/anti_leakage_check.py paper.tex || REVERT
118
+
119
+ # 5. limitation grep (Rule 2)
120
+ grep -in -E '\blimitation' paper.tex | grep -v -E '^[0-9]+:\s*%' && REVERT
121
+
122
+ # 6. score and decide
123
+ python skills/content-refinement-agent/scripts/score_delta.py \
124
+ --prev ../iter<N-1>/score.json --curr score.json
125
+ # exit 0 → keep, exit 4 → keep then halt (plateau), exit 1/2 → revert, exit 3 → input error
126
+ ```
127
+
128
+ If all gates pass and `score_delta.py` returns 0, the iteration is
128
+ accepted. Exit code 4 (HALT_PLATEAU) also accepts it, then stops the loop.
.scider/skills/content-refinement-agent/scripts/apply_worklog.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ apply_worklog.py — Append a timestamped iteration entry to worklog.json.
4
+
5
+ The worklog is the canonical history of the refinement loop: every
6
+ iteration's review, score, decision, and actions taken. The orchestrator
7
+ reads it at the end to identify the best snapshot to promote.
8
+
9
+ Usage:
10
+ python apply_worklog.py \\
11
+ --worklog workspace/refinement/worklog.json \\
12
+ --iter 2 \\
13
+ --review iter2/review.json \\
14
+ --score iter2/score.json \\
15
+ --decision ACCEPT_IMPROVED \\
16
+ --actions iter2/worklog_entry.json # the agent's emitted worklog block
17
+
18
+ The script creates worklog.json if it doesn't exist.
19
+ """
20
+ import argparse
21
+ import datetime as dt
22
+ import json
23
+ import os
24
+ import sys
25
+
26
+
27
+ def load_json(path: str | None) -> dict | list | None:
28
+ if not path or not os.path.exists(path):
29
+ return None
30
+ with open(path) as f:
31
+ return json.load(f)
32
+
33
+
34
def main() -> int:
    """Append one iteration entry to the worklog and recompute ``best_iter``.

    Parses the command line, loads the worklog (or starts a fresh one when
    the file does not exist), appends a timestamped entry for this
    iteration, records the halt reason if one was given, re-selects the
    best accepted iteration, and writes the worklog back to disk.

    Returns:
        0 on success.

    Raises:
        json.JSONDecodeError: If the worklog or any supplied input file is
            not valid JSON.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--worklog", required=True, help="path to worklog.json")
    p.add_argument("--iter", type=int, required=True, help="iteration number (0-indexed)")
    p.add_argument("--review", help="path to review.json for this iteration")
    p.add_argument("--score", help="path to score.json for this iteration")
    p.add_argument(
        "--decision",
        required=True,
        help="ACCEPT_IMPROVED / ACCEPT_TIED_NON_NEGATIVE / "
        "REVERT_OVERALL_DECREASED / REVERT_TIED_NEGATIVE_SUBAXIS",
    )
    p.add_argument(
        "--actions",
        help="path to the agent's worklog block JSON "
        "(addressed_weaknesses, integrated_answers, actions_taken)",
    )
    p.add_argument("--halted-because", help="reason if this iteration triggers a halt")
    args = p.parse_args()

    try:
        with open(args.worklog, encoding="utf-8") as f:
            wl = json.load(f)
    except FileNotFoundError:
        wl = {}
    # Tolerate worklogs that are missing top-level keys (e.g. hand-edited
    # files) instead of failing with KeyError further down.
    wl.setdefault("iterations", [])
    wl.setdefault("halted_because", None)
    wl.setdefault("best_iter", None)

    entry = {
        "iter": args.iter,
        "timestamp": dt.datetime.now(dt.timezone.utc).isoformat(),
        "decision": args.decision,
        # Optional inputs: only read the files that were actually supplied.
        "review": load_json(args.review) if args.review else None,
        "score": load_json(args.score) if args.score else None,
        "actions": load_json(args.actions) if args.actions else None,
    }
    wl["iterations"].append(entry)

    if args.halted_because:
        wl["halted_because"] = args.halted_because

    # Re-compute best_iter: highest accepted overall_score.
    accepted = [
        it
        for it in wl["iterations"]
        if str(it.get("decision") or "").startswith("ACCEPT") and it.get("score")
    ]
    if accepted:
        # On equal scores prefer the later iteration: a tied iteration is only
        # accepted when its net sub-axis delta is non-negative, so it is at
        # least as good as the earlier one. Plain max() would keep the first.
        best = max(
            accepted,
            key=lambda it: (it["score"].get("overall_score", 0), it.get("iter", -1)),
        )
        wl["best_iter"] = best["iter"]

    os.makedirs(os.path.dirname(os.path.abspath(args.worklog)) or ".", exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 regardless of the platform default encoding.
    with open(args.worklog, "w", encoding="utf-8") as f:
        json.dump(wl, f, indent=2, ensure_ascii=False)

    print(f"OK: appended iter {args.iter} ({args.decision}) to {args.worklog}")
    if wl["best_iter"] is not None:
        print(f"  current best_iter: {wl['best_iter']}")
    return 0
91
+
92
+
93
+ if __name__ == "__main__":
94
+ sys.exit(main())
.scider/skills/content-refinement-agent/scripts/score_delta.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ score_delta.py — Apply the PaperOrchestra refinement halt rules from two
4
+ score JSONs.
5
+
6
+ Encodes the halt rules from arXiv:2604.05018 §4 Step 5:
7
+
8
+ - ACCEPT if curr.overall > prev.overall
9
+ - ACCEPT if curr.overall == prev.overall AND net sub-axis delta >= 0
10
+ - REVERT (overall_decreased) if curr.overall < prev.overall
11
+ - REVERT (tied_negative_subaxis) if curr.overall == prev.overall AND
12
+ net sub-axis delta < 0
13
+
14
+ Additionally encodes the plateau early-stop rule (not in the original paper
15
+ but added to match its cost budget of ~5-7 LLM calls):
16
+
17
+ - HALT_PLATEAU if the improvement is accepted but overall_delta is below
18
+ --plateau-threshold for --plateau-streak or more consecutive iterations.
19
+ Exit code 4. The loop should stop — further iterations are unlikely to
20
+ yield meaningful gains.
21
+
22
+ Exit codes:
23
+ 0 ACCEPT (improved or tied non-negative, and no plateau)
24
+ 1 REVERT (overall decreased)
25
+ 2 REVERT (tied with negative sub-axis delta)
26
+ 3 argument or input error
27
+ 4 HALT_PLATEAU (accepted but diminishing returns detected)
28
+
29
+ Score JSON shape (see references/reviewer-rubric.md):
30
+ {
31
+ "axis_scores": {
32
+ "scientific_depth": {"score": 65, ...},
33
+ "technical_execution": {"score": 70, ...},
34
+ "logical_flow": {"score": 60, ...},
35
+ "writing_clarity": {"score": 55, ...},
36
+ "evidence_presentation":{"score": 72, ...},
37
+ "academic_style": {"score": 68, ...}
38
+ },
39
+ "overall_score": 64.5,
40
+ ...
41
+ }
42
+
43
+ Usage:
44
+ python score_delta.py --prev iter0/score.json --curr iter1/score.json
45
+ python score_delta.py --prev iter2/score.json --curr iter3/score.json \\
46
+ --plateau-threshold 1.0 --plateau-streak 2 --consecutive-small 2
47
+ """
48
+ import argparse
49
+ import json
50
+ import sys
51
+
52
# Sub-axis keys expected under "axis_scores" in a score JSON.
AXES = [
    "scientific_depth",
    "technical_execution",
    "logical_flow",
    "writing_clarity",
    "evidence_presentation",
    "academic_style",
]

DEFAULT_PLATEAU_THRESHOLD = 1.0  # points
DEFAULT_PLATEAU_STREAK = 3  # consecutive iterations below threshold → halt


def load(path: str) -> dict:
    """Read and decode one score JSON file.

    Args:
        path: Path to the score JSON.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # UTF-8, and score files may carry non-ASCII justification text.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _overall(doc: dict, label: str) -> float:
    """Return ``doc["overall_score"]`` as a float, rejecting missing values."""
    if not isinstance(doc, dict):
        raise ValueError(f"{label} score JSON must be an object")
    if "overall_score" not in doc:
        # Defaulting to 0 would make a missing score look like a huge
        # improvement or regression, so treat it as an input error instead.
        raise ValueError(f"{label} score JSON has no 'overall_score'")
    return float(doc["overall_score"])


def _axis_deltas(prev: dict, curr: dict) -> dict[str, float]:
    """Return the per-axis score change (curr - prev); absent axes count as 0."""
    p_axes = prev.get("axis_scores") or {}
    c_axes = curr.get("axis_scores") or {}
    deltas: dict[str, float] = {}
    for ax in AXES:
        ps = float((p_axes.get(ax) or {}).get("score", 0))
        cs = float((c_axes.get(ax) or {}).get("score", 0))
        deltas[ax] = cs - ps
    return deltas


def main() -> int:
    """Compare two score JSONs and print the accept/revert/halt decision.

    Prints a JSON summary to stdout on success and returns the exit code
    (the same value is stored in the summary under ``exit_code``).

    Returns:
        0 for ACCEPT, 1 for REVERT (overall decreased), 2 for REVERT (tied
        with negative net sub-axis delta), 3 for an argument or input error,
        4 for HALT_PLATEAU.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--prev", required=True, help="Score JSON from previous accepted iteration")
    p.add_argument("--curr", required=True, help="Score JSON from just-completed iteration")
    p.add_argument(
        "--plateau-threshold",
        type=float,
        default=DEFAULT_PLATEAU_THRESHOLD,
        metavar="POINTS",
        help=f"Minimum overall_delta to not count as a 'small' improvement "
        f"(default: {DEFAULT_PLATEAU_THRESHOLD})",
    )
    p.add_argument(
        "--plateau-streak",
        type=int,
        default=DEFAULT_PLATEAU_STREAK,
        metavar="N",
        help=f"Number of consecutive small improvements before HALT_PLATEAU "
        f"(default: {DEFAULT_PLATEAU_STREAK})",
    )
    p.add_argument(
        "--consecutive-small",
        type=int,
        default=0,
        metavar="N",
        help="Number of consecutive small-delta accepted iterations so far "
        "(maintained by the calling loop; default: 0)",
    )
    try:
        args = p.parse_args()
    except SystemExit as exc:
        # argparse exits with status 2 on a usage error, which would be
        # indistinguishable from the REVERT (tied, negative sub-axis) code.
        # Map it to the error code 3; let --help (status 0) through.
        if exc.code in (0, None):
            raise
        return 3

    try:
        prev = load(args.prev)
        curr = load(args.curr)
        p_overall = _overall(prev, "previous")
        c_overall = _overall(curr, "current")
        deltas = _axis_deltas(prev, curr)
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # ValueError covers json.JSONDecodeError and non-numeric scores;
        # TypeError/AttributeError cover wrongly-shaped documents. Without
        # this, an uncaught exception would exit with status 1 and be read
        # by the caller as a REVERT.
        print(f"ERROR: failed to load score JSONs: {e}", file=sys.stderr)
        return 3

    overall_delta = c_overall - p_overall
    net_subaxis = sum(deltas.values())

    # --- Primary accept/revert decision ---
    if c_overall > p_overall:
        decision = "ACCEPT_IMPROVED"
        exit_code = 0
    elif c_overall == p_overall:
        if net_subaxis >= 0:
            decision = "ACCEPT_TIED_NON_NEGATIVE"
            exit_code = 0
        else:
            decision = "REVERT_TIED_NEGATIVE_SUBAXIS"
            exit_code = 2
    else:
        decision = "REVERT_OVERALL_DECREASED"
        exit_code = 1

    # --- Plateau early-stop (only applies to accepted iterations) ---
    accepted = exit_code == 0
    is_small_delta = overall_delta < args.plateau_threshold
    if not accepted:
        # A reverted iteration is discarded, so it neither extends nor
        # resets the streak of accepted small-delta iterations.
        new_consecutive_small = args.consecutive_small
    elif is_small_delta:
        new_consecutive_small = args.consecutive_small + 1
    else:
        new_consecutive_small = 0
    plateau_triggered = False

    if accepted and new_consecutive_small >= args.plateau_streak:
        decision = "HALT_PLATEAU"
        exit_code = 4
        plateau_triggered = True

    out = {
        "decision": decision,
        "exit_code": exit_code,
        "overall_prev": p_overall,
        "overall_curr": c_overall,
        "overall_delta": overall_delta,
        "subaxis_deltas": deltas,
        "net_subaxis": net_subaxis,
        "is_small_delta": is_small_delta,
        "consecutive_small": new_consecutive_small,
        "plateau_threshold": args.plateau_threshold,
        "plateau_streak": args.plateau_streak,
        "plateau_triggered": plateau_triggered,
    }
    print(json.dumps(out, indent=2))
    return exit_code
161
+
162
+
163
+ if __name__ == "__main__":
164
+ sys.exit(main())
.scider/skills/content-refinement-agent/scripts/snapshot.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ snapshot.py — Copy a paper.tex (and optionally paper.pdf) into a refinement
4
+ iteration directory, so reverts are real, not symbolic.
5
+
6
+ The PaperOrchestra refinement halt rules require the loop to roll back to
7
+ the previous iteration on overall-score decrease or tied negative sub-axis
8
+ delta. To do that physically, every iteration's draft must be preserved.
9
+
10
+ Usage:
11
+ python snapshot.py --src paper.tex --dst iter2/
12
+ python snapshot.py --src paper.tex --src-pdf paper.pdf --dst iter2/
13
+ """
14
+ import argparse
15
+ import os
16
+ import shutil
17
+ import sys
18
+
19
+
20
def _copy_into(src: str, dst: str) -> None:
    """Copy ``src`` to ``dst`` with metadata; a no-op if both are the same file."""
    try:
        shutil.copy2(src, dst)
    except shutil.SameFileError:
        # The source already lives at the destination path, so the snapshot
        # already exists; report it rather than crash with a traceback.
        print(f"OK: {src} is already at {dst}, nothing to copy")
        return
    print(f"OK: snapshot {src} → {dst}")


def main() -> int:
    """Copy paper.tex (and optionally paper.pdf) into an iteration directory.

    The destination directory is created if needed. The files are always
    stored under the fixed names ``paper.tex`` and ``paper.pdf``.

    Returns:
        0 on success (a missing optional PDF only produces a warning);
        1 if the source .tex file does not exist or the destination
        directory cannot be created.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--src", required=True, help="source paper.tex path")
    p.add_argument("--src-pdf", help="optional source paper.pdf path")
    p.add_argument("--dst", required=True, help="destination iteration directory")
    args = p.parse_args()

    if not os.path.isfile(args.src):
        print(f"ERROR: {args.src} not found", file=sys.stderr)
        return 1

    try:
        os.makedirs(args.dst, exist_ok=True)
    except OSError as e:
        # e.g. --dst names an existing regular file, or permission denied.
        print(f"ERROR: cannot create {args.dst}: {e}", file=sys.stderr)
        return 1

    _copy_into(args.src, os.path.join(args.dst, "paper.tex"))

    if args.src_pdf:
        if not os.path.isfile(args.src_pdf):
            print(f"WARN: {args.src_pdf} not found, skipping PDF snapshot", file=sys.stderr)
        else:
            _copy_into(args.src_pdf, os.path.join(args.dst, "paper.pdf"))
    return 0
44
+
45
+
46
+ if __name__ == "__main__":
47
+ sys.exit(main())
.scider/skills/exploratory-data-analysis/SKILL.md ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: exploratory-data-analysis
3
+ description: Comprehensive EDA on scientific data files — structure, content, quality, and characteristics analysis across 200+ formats. Use when analyzing any data file to understand its structure, quality, and downstream analysis recommendations.
4
+ allowed_agents: [data]
5
+ preload_for: [data]
6
+ ---
7
+
8
+ # Exploratory Data Analysis
9
+
10
+ ## Overview
11
+
12
+ Perform comprehensive exploratory data analysis (EDA) on scientific data files across multiple domains. This skill provides automated file type detection, format-specific analysis, data quality assessment, and generates detailed markdown reports suitable for documentation and downstream analysis planning.
13
+
14
+ **Key Capabilities:**
15
+ - Automatic detection and analysis of 200+ scientific file formats
16
+ - Comprehensive format-specific metadata extraction
17
+ - Data quality and integrity assessment
18
+ - Statistical summaries and distributions
19
+ - Visualization recommendations
20
+ - Downstream analysis suggestions
21
+ - Markdown report generation
22
+
23
+ ## When to Use This Skill
24
+
25
+ Use this skill when:
26
+ - User provides a path to a scientific data file for analysis
27
+ - User asks to "explore", "analyze", or "summarize" a data file
28
+ - User wants to understand the structure and content of scientific data
29
+ - User needs a comprehensive report of a dataset before analysis
30
+ - User wants to assess data quality or completeness
31
+ - User asks what type of analysis is appropriate for a file
32
+
33
+ ## Supported File Categories
34
+
35
+ The skill has comprehensive coverage of scientific file formats organized into six major categories:
36
+
37
+ ### 1. Chemistry and Molecular Formats (60+ extensions)
38
+ Structure files, computational chemistry outputs, molecular dynamics trajectories, and chemical databases.
39
+
40
+ **File types include:** `.pdb`, `.cif`, `.mol`, `.mol2`, `.sdf`, `.xyz`, `.smi`, `.gro`, `.log`, `.fchk`, `.cube`, `.dcd`, `.xtc`, `.trr`, `.prmtop`, `.psf`, and more.
41
+
42
+ **Reference file:** `references/chemistry_molecular_formats.md`
43
+
44
+ ### 2. Bioinformatics and Genomics Formats (50+ extensions)
45
+ Sequence data, alignments, annotations, variants, and expression data.
46
+
47
+ **File types include:** `.fasta`, `.fastq`, `.sam`, `.bam`, `.vcf`, `.bed`, `.gff`, `.gtf`, `.bigwig`, `.h5ad`, `.loom`, `.counts`, `.mtx`, and more.
48
+
49
+ **Reference file:** `references/bioinformatics_genomics_formats.md`
50
+
51
+ ### 3. Microscopy and Imaging Formats (45+ extensions)
52
+ Microscopy images, medical imaging, whole slide imaging, and electron microscopy.
53
+
54
+ **File types include:** `.tif`, `.nd2`, `.lif`, `.czi`, `.ims`, `.dcm`, `.nii`, `.mrc`, `.dm3`, `.vsi`, `.svs`, `.ome.tiff`, and more.
55
+
56
+ **Reference file:** `references/microscopy_imaging_formats.md`
57
+
58
+ ### 4. Spectroscopy and Analytical Chemistry Formats (35+ extensions)
59
+ NMR, mass spectrometry, IR/Raman, UV-Vis, X-ray, chromatography, and other analytical techniques.
60
+
61
+ **File types include:** `.fid`, `.mzML`, `.mzXML`, `.raw`, `.mgf`, `.spc`, `.jdx`, `.xy`, `.cif` (crystallography), `.wdf`, and more.
62
+
63
+ **Reference file:** `references/spectroscopy_analytical_formats.md`
64
+
65
+ ### 5. Proteomics and Metabolomics Formats (30+ extensions)
66
+ Mass spec proteomics, metabolomics, lipidomics, and multi-omics data.
67
+
68
+ **File types include:** `.mzML`, `.pepXML`, `.protXML`, `.mzid`, `.mzTab`, `.sky`, `.mgf`, `.msp`, `.h5ad`, and more.
69
+
70
+ **Reference file:** `references/proteomics_metabolomics_formats.md`
71
+
72
+ ### 6. General Scientific Data Formats (30+ extensions)
73
+ Arrays, tables, hierarchical data, compressed archives, and common scientific formats.
74
+
75
+ **File types include:** `.npy`, `.npz`, `.csv`, `.xlsx`, `.json`, `.hdf5`, `.zarr`, `.parquet`, `.mat`, `.fits`, `.nc`, `.xml`, and more.
76
+
77
+ **Reference file:** `references/general_scientific_formats.md`
78
+
79
+ ## Workflow
80
+
81
+ ### Step 1: File Type Detection
82
+
83
+ When a user provides a file path, first identify the file type:
84
+
85
+ 1. Extract the file extension
86
+ 2. Look up the extension in the appropriate reference file
87
+ 3. Identify the file category and format description
88
+ 4. Load format-specific information
89
+
90
+ **Example:**
91
+ ```
92
+ User: "Analyze data.fastq"
93
+ → Extension: .fastq
94
+ → Category: bioinformatics_genomics
95
+ → Format: FASTQ Format (sequence data with quality scores)
96
+ → Reference: references/bioinformatics_genomics_formats.md
97
+ ```
98
+
99
+ ### Step 2: Load Format-Specific Information
100
+
101
+ Based on the file type, read the corresponding reference file to understand:
102
+ - **Typical Data:** What kind of data this format contains
103
+ - **Use Cases:** Common applications for this format
104
+ - **Python Libraries:** How to read the file in Python
105
+ - **EDA Approach:** What analyses are appropriate for this data type
106
+
107
+ Search the reference file for the specific extension (e.g., search for "### .fastq" in `bioinformatics_genomics_formats.md`).
108
+
109
+ ### Step 3: Perform Data Analysis
110
+
111
+ Use the `scripts/eda_analyzer.py` script OR implement custom analysis:
112
+
113
+ **Option A: Use the analyzer script**
114
+ ```bash
115
+ # The script automatically:
116
+ # 1. Detects file type
117
+ # 2. Loads reference information
118
+ # 3. Performs format-specific analysis
119
+ # 4. Generates markdown report
120
+
121
+ python scripts/eda_analyzer.py <filepath> [output.md]
122
+ ```
123
+
124
+ **Option B: Custom analysis in the conversation**
125
+ Based on the format information from the reference file, perform appropriate analysis:
126
+
127
+ For tabular data (CSV, TSV, Excel):
128
+ - Load with pandas
129
+ - Check dimensions, data types
130
+ - Analyze missing values
131
+ - Calculate summary statistics
132
+ - Identify outliers
133
+ - Check for duplicates
134
+
135
+ For sequence data (FASTA, FASTQ):
136
+ - Count sequences
137
+ - Analyze length distributions
138
+ - Calculate GC content
139
+ - Assess quality scores (FASTQ)
140
+
141
+ For images (TIFF, ND2, CZI):
142
+ - Check dimensions (X, Y, Z, C, T)
143
+ - Analyze bit depth and value range
144
+ - Extract metadata (channels, timestamps, spatial calibration)
145
+ - Calculate intensity statistics
146
+
147
+ For arrays (NPY, HDF5):
148
+ - Check shape and dimensions
149
+ - Analyze data type
150
+ - Calculate statistical summaries
151
+ - Check for missing/invalid values
152
+
153
+ ### Step 4: Generate Comprehensive Report
154
+
155
+ Create a markdown report with the following sections:
156
+
157
+ #### Required Sections:
158
+ 1. **Title and Metadata**
159
+ - Filename and timestamp
160
+ - File size and location
161
+
162
+ 2. **Basic Information**
163
+ - File properties
164
+ - Format identification
165
+
166
+ 3. **File Type Details**
167
+ - Format description from reference
168
+ - Typical data content
169
+ - Common use cases
170
+ - Python libraries for reading
171
+
172
+ 4. **Data Analysis**
173
+ - Structure and dimensions
174
+ - Statistical summaries
175
+ - Quality assessment
176
+ - Data characteristics
177
+
178
+ 5. **Key Findings**
179
+ - Notable patterns
180
+ - Potential issues
181
+ - Quality metrics
182
+
183
+ 6. **Recommendations**
184
+ - Preprocessing steps
185
+ - Appropriate analyses
186
+ - Tools and methods
187
+ - Visualization approaches
188
+
189
+ #### Template Location
190
+ Use `assets/report_template.md` as a guide for report structure.
191
+
192
+ ### Step 5: Save Report
193
+
194
+ Save the markdown report with a descriptive filename:
195
+ - Pattern: `{original_filename_without_extension}_eda_report.md`
196
+ - Example: `experiment_data.fastq` → `experiment_data_eda_report.md`
197
+
198
+ ## Detailed Format References
199
+
200
+ Each reference file contains comprehensive information for dozens of file types. To find information about a specific format:
201
+
202
+ 1. Identify the category from the extension
203
+ 2. Read the appropriate reference file
204
+ 3. Search for the section heading matching the extension (e.g., "### .pdb")
205
+ 4. Extract the format information
206
+
207
+ ### Reference File Structure
208
+
209
+ Each format entry includes:
210
+ - **Description:** What the format is
211
+ - **Typical Data:** What it contains
212
+ - **Use Cases:** Common applications
213
+ - **Python Libraries:** How to read it (with code examples)
214
+ - **EDA Approach:** Specific analyses to perform
215
+
216
+ **Example lookup:**
217
+ ```markdown
218
+ ### .pdb - Protein Data Bank
219
+ **Description:** Standard format for 3D structures of biological macromolecules
220
+ **Typical Data:** Atomic coordinates, residue information, secondary structure
221
+ **Use Cases:** Protein structure analysis, molecular visualization, docking
222
+ **Python Libraries:**
223
+ - `Biopython`: `Bio.PDB`
224
+ - `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
225
+ **EDA Approach:**
226
+ - Structure validation (bond lengths, angles)
227
+ - B-factor distribution
228
+ - Missing residues detection
229
+ - Ramachandran plots
230
+ ```
231
+
232
+ ## Best Practices
233
+
234
+ ### Reading Reference Files
235
+
236
+ Reference files are large (10,000+ words each). To efficiently use them:
237
+
238
+ 1. **Search by extension:** Use `grep` or a regular expression to extract only the section for the specific format
239
+ ```python
240
+ import re
241
+ with open('references/chemistry_molecular_formats.md', 'r') as f:
242
+ content = f.read()
243
+ pattern = r'### \.pdb[^#]*?(?=###|\Z)'
244
+ match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
245
+ ```
246
+
247
+ 2. **Extract relevant sections:** Don't load entire reference files into context unnecessarily
248
+
249
+ 3. **Cache format info:** If analyzing multiple files of the same type, reuse the format information
250
+
251
+ ### Data Analysis
252
+
253
+ 1. **Sample large files:** For files with millions of records, analyze a representative sample
254
+ 2. **Handle errors gracefully:** Many scientific formats require specific libraries; provide clear installation instructions
255
+ 3. **Validate metadata:** Cross-check metadata consistency (e.g., stated dimensions vs actual data)
256
+ 4. **Consider data provenance:** Note instrument, software versions, processing steps
257
+
258
+ ### Report Generation
259
+
260
+ 1. **Be comprehensive:** Include all relevant information for downstream analysis
261
+ 2. **Be specific:** Provide concrete recommendations based on the file type
262
+ 3. **Be actionable:** Suggest specific next steps and tools
263
+ 4. **Include code examples:** Show how to load and work with the data
264
+
265
+ ## Examples
266
+
267
+ ### Example 1: Analyzing a FASTQ file
268
+
269
+ ```python
270
+ # User provides: "Analyze reads.fastq"
271
+
272
+ # 1. Detect file type
273
+ extension = '.fastq'
274
+ category = 'bioinformatics_genomics'
275
+
276
+ # 2. Read reference info
277
+ # Search references/bioinformatics_genomics_formats.md for "### .fastq"
278
+
279
+ # 3. Perform analysis
280
+ from Bio import SeqIO
281
+ sequences = list(SeqIO.parse('reads.fastq', 'fastq'))
282
+ # Calculate: read count, length distribution, quality scores, GC content
283
+
284
+ # 4. Generate report
285
+ # Include: format description, analysis results, QC recommendations
286
+
287
+ # 5. Save as: reads_eda_report.md
288
+ ```
289
+
290
+ ### Example 2: Analyzing a CSV dataset
291
+
292
+ ```python
293
+ # User provides: "Explore experiment_results.csv"
294
+
295
+ # 1. Detect: .csv → general_scientific
296
+
297
+ # 2. Load reference for CSV format
298
+
299
+ # 3. Analyze
300
+ import pandas as pd
301
+ df = pd.read_csv('experiment_results.csv')
302
+ # Dimensions, dtypes, missing values, statistics, correlations
303
+
304
+ # 4. Generate report with:
305
+ # - Data structure
306
+ # - Missing value patterns
307
+ # - Statistical summaries
308
+ # - Correlation matrix
309
+ # - Outlier detection results
310
+
311
+ # 5. Save report
312
+ ```
313
+
314
+ ### Example 3: Analyzing microscopy data
315
+
316
+ ```python
317
+ # User provides: "Analyze cells.nd2"
318
+
319
+ # 1. Detect: .nd2 → microscopy_imaging (Nikon format)
320
+
321
+ # 2. Read reference for ND2 format
322
+ # Learn: multi-dimensional (XYZCT), requires nd2reader
323
+
324
+ # 3. Analyze
325
+ from nd2reader import ND2Reader
326
+ with ND2Reader('cells.nd2') as images:
327
+ # Extract: dimensions, channels, timepoints, metadata
328
+ # Calculate: intensity statistics, frame info
329
+
330
+ # 4. Generate report with:
331
+ # - Image dimensions (XY, Z-stacks, time, channels)
332
+ # - Channel wavelengths
333
+ # - Pixel size and calibration
334
+ # - Recommendations for image analysis
335
+
336
+ # 5. Save report
337
+ ```
338
+
339
+ ## Troubleshooting
340
+
341
+ ### Missing Libraries
342
+
343
+ Many scientific formats require specialized libraries:
344
+
345
+ **Problem:** Import error when trying to read a file
346
+
347
+ **Solution:** Provide clear installation instructions
348
+ ```python
349
+ try:
350
+ from Bio import SeqIO
351
+ except ImportError:
352
+ print("Install Biopython: uv pip install biopython")
353
+ ```
354
+
355
+ Common requirements by category:
356
+ - **Bioinformatics:** `biopython`, `pysam`, `pyBigWig`
357
+ - **Chemistry:** `rdkit`, `mdanalysis`, `cclib`
358
+ - **Microscopy:** `tifffile`, `nd2reader`, `aicsimageio`, `pydicom`
359
+ - **Spectroscopy:** `nmrglue`, `pymzml`, `pyteomics`
360
+ - **General:** `pandas`, `numpy`, `h5py`, `scipy`
361
+
362
+ ### Unknown File Types
363
+
364
+ If a file extension is not in the references:
365
+
366
+ 1. Ask the user about the file format
367
+ 2. Check if it's a vendor-specific variant
368
+ 3. Attempt generic analysis based on file structure (text vs binary)
369
+ 4. Provide general recommendations
370
+
371
+ ### Large Files
372
+
373
+ For very large files:
374
+
375
+ 1. Use sampling strategies (first N records)
376
+ 2. Use memory-mapped access (for HDF5, NPY)
377
+ 3. Process in chunks (for CSV, FASTQ)
378
+ 4. Provide estimates based on samples
379
+
380
+ ## Script Usage
381
+
382
+ The `scripts/eda_analyzer.py` can be used directly:
383
+
384
+ ```bash
385
+ # Basic usage
386
+ python scripts/eda_analyzer.py data.csv
387
+
388
+ # Specify output file
389
+ python scripts/eda_analyzer.py data.csv output_report.md
390
+
391
+ # The script will:
392
+ # 1. Auto-detect file type
393
+ # 2. Load format references
394
+ # 3. Perform appropriate analysis
395
+ # 4. Generate markdown report
396
+ ```
397
+
398
+ The script supports automatic analysis for many common formats, but custom analysis in the conversation provides more flexibility and domain-specific insights.
399
+
400
+ ## Advanced Usage
401
+
402
+ ### Multi-File Analysis
403
+
404
+ When analyzing multiple related files:
405
+ 1. Perform individual EDA on each file
406
+ 2. Create a summary comparison report
407
+ 3. Identify relationships and dependencies
408
+ 4. Suggest integration strategies
409
+
410
+ ### Quality Control
411
+
412
+ For data quality assessment:
413
+ 1. Check format compliance
414
+ 2. Validate metadata consistency
415
+ 3. Assess completeness
416
+ 4. Identify outliers and anomalies
417
+ 5. Compare to expected ranges/distributions
418
+
419
+ ### Preprocessing Recommendations
420
+
421
+ Based on data characteristics, recommend:
422
+ 1. Normalization strategies
423
+ 2. Missing value imputation
424
+ 3. Outlier handling
425
+ 4. Batch correction
426
+ 5. Format conversions
427
+
428
+ ## Resources
429
+
430
+ ### scripts/
431
+ - `eda_analyzer.py`: Comprehensive analysis script that can be run directly or imported
432
+
433
+ ### references/
434
+ - `chemistry_molecular_formats.md`: 60+ chemistry/molecular file formats
435
+ - `bioinformatics_genomics_formats.md`: 50+ bioinformatics formats
436
+ - `microscopy_imaging_formats.md`: 45+ imaging formats
437
+ - `spectroscopy_analytical_formats.md`: 35+ spectroscopy formats
438
+ - `proteomics_metabolomics_formats.md`: 30+ omics formats
439
+ - `general_scientific_formats.md`: 30+ general formats
440
+
441
+ ### assets/
442
+ - `report_template.md`: Comprehensive markdown template for EDA reports
.scider/skills/exploratory-data-analysis/assets/report_template.md ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exploratory Data Analysis Report: {FILENAME}
2
+
3
+ **Generated:** {TIMESTAMP}
4
+
5
+ ---
6
+
7
+ ## Executive Summary
8
+
9
+ This report provides a comprehensive exploratory data analysis of the file `{FILENAME}`. The analysis includes file type identification, format-specific metadata extraction, data quality assessment, and recommendations for downstream analysis.
10
+
11
+ ---
12
+
13
+ ## Basic Information
14
+
15
+ - **Filename:** `{FILENAME}`
16
+ - **Full Path:** `{FILEPATH}`
17
+ - **File Size:** {FILE_SIZE_HUMAN} ({FILE_SIZE_BYTES} bytes)
18
+ - **Last Modified:** {MODIFIED_DATE}
19
+ - **Extension:** `.{EXTENSION}`
20
+ - **Format Category:** {CATEGORY}
21
+
22
+ ---
23
+
24
+ ## File Type Details
25
+
26
+ ### Format Description
27
+ {FORMAT_DESCRIPTION}
28
+
29
+ ### Typical Data Content
30
+ {TYPICAL_DATA}
31
+
32
+ ### Common Use Cases
33
+ {USE_CASES}
34
+
35
+ ### Python Libraries for Reading
36
+ {PYTHON_LIBRARIES}
37
+
38
+ ---
39
+
40
+ ## Data Structure Analysis
41
+
42
+ ### Overview
43
+ {DATA_STRUCTURE_OVERVIEW}
44
+
45
+ ### Dimensions
46
+ {DIMENSIONS}
47
+
48
+ ### Data Types
49
+ {DATA_TYPES}
50
+
51
+ ---
52
+
53
+ ## Quality Assessment
54
+
55
+ ### Completeness
56
+ - **Missing Values:** {MISSING_VALUES}
57
+ - **Data Coverage:** {COVERAGE}
58
+
59
+ ### Validity
60
+ - **Range Check:** {RANGE_CHECK}
61
+ - **Format Compliance:** {FORMAT_COMPLIANCE}
62
+ - **Consistency:** {CONSISTENCY}
63
+
64
+ ### Integrity
65
+ - **Checksum/Validation:** {VALIDATION}
66
+ - **File Corruption Check:** {CORRUPTION_CHECK}
67
+
68
+ ---
69
+
70
+ ## Statistical Summary
71
+
72
+ ### Numerical Variables
73
+ {NUMERICAL_STATS}
74
+
75
+ ### Categorical Variables
76
+ {CATEGORICAL_STATS}
77
+
78
+ ### Distributions
79
+ {DISTRIBUTIONS}
80
+
81
+ ---
82
+
83
+ ## Data Characteristics
84
+
85
+ ### Temporal Properties (if applicable)
86
+ - **Time Range:** {TIME_RANGE}
87
+ - **Sampling Rate:** {SAMPLING_RATE}
88
+ - **Missing Time Points:** {MISSING_TIMEPOINTS}
89
+
90
+ ### Spatial Properties (if applicable)
91
+ - **Dimensions:** {SPATIAL_DIMENSIONS}
92
+ - **Resolution:** {SPATIAL_RESOLUTION}
93
+ - **Coordinate System:** {COORDINATE_SYSTEM}
94
+
95
+ ### Experimental Metadata (if applicable)
96
+ - **Instrument:** {INSTRUMENT}
97
+ - **Method:** {METHOD}
98
+ - **Sample Info:** {SAMPLE_INFO}
99
+
100
+ ---
101
+
102
+ ## Key Findings
103
+
104
+ 1. **Data Volume:** {DATA_VOLUME_FINDING}
105
+ 2. **Data Quality:** {DATA_QUALITY_FINDING}
106
+ 3. **Notable Patterns:** {PATTERNS_FINDING}
107
+ 4. **Potential Issues:** {ISSUES_FINDING}
108
+
109
+ ---
110
+
111
+ ## Visualizations
112
+
113
+ ### Distribution Plots
114
+ {DISTRIBUTION_PLOTS}
115
+
116
+ ### Correlation Analysis
117
+ {CORRELATION_PLOTS}
118
+
119
+ ### Time Series (if applicable)
120
+ {TIMESERIES_PLOTS}
121
+
122
+ ---
123
+
124
+ ## Recommendations for Further Analysis
125
+
126
+ ### Immediate Actions
127
+ 1. {RECOMMENDATION_1}
128
+ 2. {RECOMMENDATION_2}
129
+ 3. {RECOMMENDATION_3}
130
+
131
+ ### Preprocessing Steps
132
+ - {PREPROCESSING_1}
133
+ - {PREPROCESSING_2}
134
+ - {PREPROCESSING_3}
135
+
136
+ ### Analytical Approaches
137
+ {ANALYTICAL_APPROACHES}
138
+
139
+ ### Tools and Methods
140
+ - **Recommended Software:** {RECOMMENDED_SOFTWARE}
141
+ - **Statistical Methods:** {STATISTICAL_METHODS}
142
+ - **Visualization Tools:** {VIZ_TOOLS}
143
+
144
+ ---
145
+
146
+ ## Data Processing Workflow
147
+
148
+ ```
149
+ {WORKFLOW_DIAGRAM}
150
+ ```
151
+
152
+ ---
153
+
154
+ ## Potential Challenges
155
+
156
+ 1. **Challenge:** {CHALLENGE_1}
157
+ - **Mitigation:** {MITIGATION_1}
158
+
159
+ 2. **Challenge:** {CHALLENGE_2}
160
+ - **Mitigation:** {MITIGATION_2}
161
+
162
+ ---
163
+
164
+ ## References and Resources
165
+
166
+ ### Format Specification
167
+ - {FORMAT_SPEC_LINK}
168
+
169
+ ### Python Libraries Documentation
170
+ - {LIBRARY_DOCS}
171
+
172
+ ### Related Analysis Examples
173
+ - {EXAMPLE_LINKS}
174
+
175
+ ---
176
+
177
+ ## Appendix
178
+
179
+ ### Complete File Metadata
180
+ ```json
181
+ {COMPLETE_METADATA}
182
+ ```
183
+
184
+ ### Analysis Parameters
185
+ ```json
186
+ {ANALYSIS_PARAMETERS}
187
+ ```
188
+
189
+ ### Software Versions
190
+ - Python: {PYTHON_VERSION}
191
+ - Key Libraries: {LIBRARY_VERSIONS}
192
+
193
+ ---
194
+
195
+ *This report was automatically generated by the exploratory-data-analysis skill.*
196
+ *For questions or issues, refer to the skill documentation.*
.scider/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bioinformatics and Genomics File Formats Reference
2
+
3
+ This reference covers file formats used in genomics, transcriptomics, sequence analysis, and related bioinformatics applications.
4
+
5
+ ## Sequence Data Formats
6
+
7
+ ### .fasta / .fa / .fna - FASTA Format
8
+ **Description:** Text-based format for nucleotide or protein sequences
9
+ **Typical Data:** DNA, RNA, or protein sequences with headers
10
+ **Use Cases:** Sequence storage, BLAST searches, alignments
11
+ **Python Libraries:**
12
+ - `Biopython`: `SeqIO.parse('file.fasta', 'fasta')`
13
+ - `pyfaidx`: Fast indexed FASTA access
14
+ - `screed`: Fast sequence parsing
15
+ **EDA Approach:**
16
+ - Sequence count and length distribution
17
+ - GC content analysis
18
+ - N content (ambiguous bases)
19
+ - Sequence ID parsing
20
+ - Duplicate detection
21
+ - Quality metrics for assemblies (N50, L50)
22
+
23
+ ### .fastq / .fq - FASTQ Format
24
+ **Description:** Sequence data with base quality scores
25
+ **Typical Data:** Raw sequencing reads with Phred quality scores
26
+ **Use Cases:** NGS data, quality control, read mapping
27
+ **Python Libraries:**
28
+ - `Biopython`: `SeqIO.parse('file.fastq', 'fastq')`
29
+ - `pysam`: Fast FASTQ/BAM operations
30
+ - `HTSeq`: Sequencing data analysis
31
+ **EDA Approach:**
32
+ - Read count and length distribution
33
+ - Quality score distribution (per-base, per-read)
34
+ - GC content and bias
35
+ - Duplicate rate estimation
36
+ - Adapter contamination detection
37
+ - k-mer frequency analysis
38
+ - Encoding format validation (Phred33/64)
39
+
40
+ ### .sam - Sequence Alignment/Map
41
+ **Description:** Tab-delimited text format for alignments
42
+ **Typical Data:** Aligned sequencing reads with mapping quality
43
+ **Use Cases:** Read alignment storage, variant calling
44
+ **Python Libraries:**
45
+ - `pysam`: `pysam.AlignmentFile('file.sam', 'r')`
46
+ - `HTSeq`: `HTSeq.SAM_Reader('file.sam')`
47
+ **EDA Approach:**
48
+ - Mapping rate and quality distribution
49
+ - Coverage analysis
50
+ - Insert size distribution (paired-end)
51
+ - Alignment flags distribution
52
+ - CIGAR string patterns
53
+ - Mismatch and indel rates
54
+ - Duplicate and supplementary alignment counts
55
+
56
+ ### .bam - Binary Alignment/Map
57
+ **Description:** Compressed binary version of SAM
58
+ **Typical Data:** Aligned reads in compressed format
59
+ **Use Cases:** Efficient storage and processing of alignments
60
+ **Python Libraries:**
61
+ - `pysam`: Full BAM support with indexing
62
+ - `bamnostic`: Pure Python BAM reader
63
+ **EDA Approach:**
64
+ - Same as SAM plus:
65
+ - Compression ratio analysis
66
+ - Index file (.bai) validation
67
+ - Chromosome-wise statistics
68
+ - Strand bias detection
69
+ - Read group analysis
70
+
71
+ ### .cram - CRAM Format
72
+ **Description:** Highly compressed alignment format
73
+ **Typical Data:** Reference-compressed aligned reads
74
+ **Use Cases:** Long-term storage, space-efficient archives
75
+ **Python Libraries:**
76
+ - `pysam`: CRAM support (requires reference)
77
+ - Reference genome must be accessible
78
+ **EDA Approach:**
79
+ - Compression efficiency vs BAM
80
+ - Reference dependency validation
81
+ - Lossy vs lossless compression assessment
82
+ - Decompression performance
83
+ - Similar alignment metrics as BAM
84
+
85
+ ### .bed - Browser Extensible Data
86
+ **Description:** Tab-delimited format for genomic features
87
+ **Typical Data:** Genomic intervals (chr, start, end) with annotations
88
+ **Use Cases:** Peak calling, variant annotation, genome browsing
89
+ **Python Libraries:**
90
+ - `pybedtools`: `pybedtools.BedTool('file.bed')`
91
+ - `pyranges`: `pyranges.read_bed('file.bed')`
92
+ - `pandas`: Simple BED reading
93
+ **EDA Approach:**
94
+ - Feature count and size distribution
95
+ - Chromosome distribution
96
+ - Strand bias
97
+ - Score distribution (if present)
98
+ - Overlap and proximity analysis
99
+ - Coverage statistics
100
+ - Gap analysis between features
101
+
102
+ ### .bedGraph - BED with Graph Data
103
+ **Description:** BED-like four-column format that assigns a signal value to each genomic interval
104
+ **Typical Data:** Continuous-valued genomic data (coverage, signals)
105
+ **Use Cases:** Coverage tracks, ChIP-seq signals, methylation
106
+ **Python Libraries:**
107
+ - `pyBigWig`: Can convert to bigWig
108
+ - `pybedtools`: BedGraph operations
109
+ **EDA Approach:**
110
+ - Signal distribution statistics
111
+ - Genome coverage percentage
112
+ - Signal dynamics (peaks, valleys)
113
+ - Chromosome-wise signal patterns
114
+ - Quantile analysis
115
+ - Zero-coverage regions
116
+
117
+ ### .bigWig / .bw - Binary BigWig
118
+ **Description:** Indexed binary format for genome-wide signal data
119
+ **Typical Data:** Continuous genomic signals (compressed and indexed)
120
+ **Use Cases:** Efficient genome browser tracks, large-scale data
121
+ **Python Libraries:**
122
+ - `pyBigWig`: `pyBigWig.open('file.bw')`
123
+ - `pybbi`: BigWig/BigBed interface
124
+ **EDA Approach:**
125
+ - Signal statistics extraction
126
+ - Zoom level analysis
127
+ - Regional signal extraction
128
+ - Efficient genome-wide summaries
129
+ - Compression efficiency
130
+ - Index structure analysis
131
+
132
+ ### .bigBed / .bb - Binary BigBed
133
+ **Description:** Indexed binary BED format
134
+ **Typical Data:** Genomic features (compressed and indexed)
135
+ **Use Cases:** Large feature sets, genome browsers
136
+ **Python Libraries:**
137
+ - `pybbi`: BigBed reading
138
+ - `pybigtools`: Modern BigBed interface
139
+ **EDA Approach:**
140
+ - Feature density analysis
141
+ - Efficient interval queries
142
+ - Zoom level validation
143
+ - Index performance metrics
144
+ - Feature size statistics
145
+
146
+ ### .gff / .gff3 - General Feature Format
147
+ **Description:** Tab-delimited format for genomic annotations
148
+ **Typical Data:** Gene models, transcripts, exons, regulatory elements
149
+ **Use Cases:** Genome annotation, gene prediction
150
+ **Python Libraries:**
151
+ - `BCBio.GFF`: Biopython-compatible GFF parser (from the `bcbio-gff` package)
152
+ - `gffutils`: `gffutils.create_db('file.gff3')`
153
+ - `pyranges`: GFF support
154
+ **EDA Approach:**
155
+ - Feature type distribution (gene, exon, CDS, etc.)
156
+ - Gene structure validation
157
+ - Strand balance
158
+ - Hierarchical relationship validation
159
+ - Phase validation for CDS
160
+ - Attribute completeness
161
+ - Gene model statistics (introns, exons per gene)
162
+
163
+ ### .gtf - Gene Transfer Format
164
+ **Description:** GFF2-based format for gene annotations
165
+ **Typical Data:** Gene and transcript annotations
166
+ **Use Cases:** RNA-seq analysis, gene quantification
167
+ **Python Libraries:**
168
+ - `pyranges`: `pyranges.read_gtf('file.gtf')`
169
+ - `gffutils`: GTF database creation
170
+ - `HTSeq`: GTF reading for counts
171
+ **EDA Approach:**
172
+ - Transcript isoform analysis
173
+ - Gene structure completeness
174
+ - Exon number distribution
175
+ - Transcript length distribution
176
+ - TSS and TES analysis
177
+ - Biotype distribution
178
+ - Overlapping gene detection
179
+
180
+ ### .vcf - Variant Call Format
181
+ **Description:** Text format for genetic variants
182
+ **Typical Data:** SNPs, indels, structural variants with annotations
183
+ **Use Cases:** Variant calling, population genetics, GWAS
184
+ **Python Libraries:**
185
+ - `pysam`: `pysam.VariantFile('file.vcf')`
186
+ - `cyvcf2`: Fast VCF parsing
187
+ - `PyVCF`: Older but comprehensive
188
+ **EDA Approach:**
189
+ - Variant count by type (SNP, indel, SV)
190
+ - Quality score distribution
191
+ - Allele frequency spectrum
192
+ - Transition/transversion ratio
193
+ - Heterozygosity rates
194
+ - Missing genotype analysis
195
+ - Hardy-Weinberg equilibrium
196
+ - Annotation completeness (if annotated)
197
+
198
+ ### .bcf - Binary VCF
199
+ **Description:** Compressed binary variant format
200
+ **Typical Data:** Same as VCF but binary
201
+ **Use Cases:** Efficient variant storage and processing
202
+ **Python Libraries:**
203
+ - `pysam`: Full BCF support
204
+ - `cyvcf2`: Optimized BCF reading
205
+ **EDA Approach:**
206
+ - Same as VCF plus:
207
+ - Compression efficiency
208
+ - Indexing validation
209
+ - Read performance metrics
210
+
211
+ ### .gvcf - Genomic VCF
212
+ **Description:** VCF with reference confidence blocks
213
+ **Typical Data:** All positions (variant and non-variant)
214
+ **Use Cases:** Joint genotyping workflows, GATK
215
+ **Python Libraries:**
216
+ - `pysam`: GVCF support
217
+ - Standard VCF parsers
218
+ **EDA Approach:**
219
+ - Reference block analysis
220
+ - Coverage uniformity
221
+ - Variant density
222
+ - Genotype quality across genome
223
+ - Reference confidence distribution
224
+
225
+ ## RNA-Seq and Expression Data
226
+
227
+ ### .counts - Gene Count Matrix
228
+ **Description:** Tab-delimited gene expression counts
229
+ **Typical Data:** Gene IDs with read counts per sample
230
+ **Use Cases:** RNA-seq quantification, differential expression
231
+ **Python Libraries:**
232
+ - `pandas`: `pd.read_csv('file.counts', sep='\t')`
233
+ - `scanpy` (for single-cell): `sc.read_csv()`
234
+ **EDA Approach:**
235
+ - Library size distribution
236
+ - Detection rate (genes per sample)
237
+ - Zero-inflation analysis
238
+ - Count distribution (log scale)
239
+ - Outlier sample detection
240
+ - Correlation between replicates
241
+ - PCA for sample relationships
242
+
243
+ ### .tpm / .fpkm - Normalized Expression
244
+ **Description:** Normalized gene expression values
245
+ **Typical Data:** TPM (transcripts per million) or FPKM values
246
+ **Use Cases:** Cross-sample comparison, visualization
247
+ **Python Libraries:**
248
+ - `pandas`: Standard CSV reading
249
+ - `anndata`: For integrated analysis
250
+ **EDA Approach:**
251
+ - Expression distribution
252
+ - Highly expressed gene identification
253
+ - Sample clustering
254
+ - Batch effect detection
255
+ - Coefficient of variation analysis
256
+ - Dynamic range assessment
257
+
258
+ ### .mtx - Matrix Market Format
259
+ **Description:** Sparse matrix format (common in single-cell)
260
+ **Typical Data:** Sparse count matrices (cells × genes)
261
+ **Use Cases:** Single-cell RNA-seq, large sparse matrices
262
+ **Python Libraries:**
263
+ - `scipy.io`: `scipy.io.mmread('file.mtx')`
264
+ - `scanpy`: `sc.read_mtx('file.mtx')`
265
+ **EDA Approach:**
266
+ - Sparsity analysis
267
+ - Cell and gene filtering thresholds
268
+ - Doublet detection metrics
269
+ - Mitochondrial fraction
270
+ - UMI count distribution
271
+ - Gene detection per cell
272
+
273
+ ### .h5ad - Anndata Format
274
+ **Description:** HDF5-based annotated data matrix
275
+ **Typical Data:** Expression matrix with metadata (cells, genes)
276
+ **Use Cases:** Single-cell RNA-seq analysis with Scanpy
277
+ **Python Libraries:**
278
+ - `scanpy`: `sc.read_h5ad('file.h5ad')`
279
+ - `anndata`: Direct AnnData manipulation
280
+ **EDA Approach:**
281
+ - Cell and gene counts
282
+ - Metadata completeness
283
+ - Layer availability (raw, normalized)
284
+ - Embedding presence (PCA, UMAP)
285
+ - QC metrics distribution
286
+ - Batch information
287
+ - Cell type annotation coverage
288
+
289
+ ### .loom - Loom Format
290
+ **Description:** HDF5-based format for omics data
291
+ **Typical Data:** Expression matrices with metadata
292
+ **Use Cases:** Single-cell data, RNA velocity analysis
293
+ **Python Libraries:**
294
+ - `loompy`: `loompy.connect('file.loom')`
295
+ - `scanpy`: Can import loom files
296
+ **EDA Approach:**
297
+ - Layer analysis (spliced, unspliced)
298
+ - Row and column attribute exploration
299
+ - Graph connectivity analysis
300
+ - Cluster assignments
301
+ - Velocity-specific metrics
302
+
303
+ ### .rds - R Data Serialization
304
+ **Description:** R object storage (often Seurat objects)
305
+ **Typical Data:** R analysis results, especially single-cell
306
+ **Use Cases:** R-Python data exchange
307
+ **Python Libraries:**
308
+ - `pyreadr`: `pyreadr.read_r('file.rds')` (data frames only; cannot load Seurat or other complex R objects)
309
+ - `rpy2`: For full R integration
310
+ - Conversion tools to AnnData
311
+ **EDA Approach:**
312
+ - Object type identification
313
+ - Data structure exploration
314
+ - Metadata extraction
315
+ - Conversion validation
316
+
317
+ ## Alignment and Assembly Formats
318
+
319
+ ### .maf - Multiple Alignment Format
320
+ **Description:** Text format for multiple sequence alignments
321
+ **Typical Data:** Genome-wide or local multiple alignments
322
+ **Use Cases:** Comparative genomics, conservation analysis
323
+ **Python Libraries:**
324
+ - `Biopython`: `AlignIO.parse('file.maf', 'maf')`
325
+ - `bx-python`: MAF-specific tools
326
+ **EDA Approach:**
327
+ - Alignment block statistics
328
+ - Species coverage
329
+ - Gap analysis
330
+ - Conservation scoring
331
+ - Alignment quality metrics
332
+ - Block length distribution
333
+
334
+ ### .axt - Pairwise Alignment Format
335
+ **Description:** Pairwise alignment format (UCSC)
336
+ **Typical Data:** Pairwise genomic alignments
337
+ **Use Cases:** Genome comparison, synteny analysis
338
+ **Python Libraries:**
339
+ - Custom parsers (simple format)
340
+ - `bx-python`: AXT support
341
+ **EDA Approach:**
342
+ - Alignment score distribution
343
+ - Identity percentage
344
+ - Syntenic block identification
345
+ - Gap size analysis
346
+ - Coverage statistics
347
+
348
+ ### .chain - Chain Alignment Format
349
+ **Description:** Genome coordinate mapping chains
350
+ **Typical Data:** Coordinate transformations between genome builds
351
+ **Use Cases:** Liftover, coordinate conversion
352
+ **Python Libraries:**
353
+ - `pyliftover`: Chain file usage
354
+ - Custom parsers for chain format
355
+ **EDA Approach:**
356
+ - Chain score distribution
357
+ - Coverage of source genome
358
+ - Gap analysis
359
+ - Inversion detection
360
+ - Mapping quality assessment
361
+
362
+ ### .psl - Pattern Space Layout
363
+ **Description:** Tab-delimited alignment format produced by BLAT
364
+ **Typical Data:** Alignment results from BLAT
365
+ **Use Cases:** Transcript mapping, similarity searches
366
+ **Python Libraries:**
367
+ - Custom parsers (tab-delimited)
368
+ - `pybedtools`: Can handle PSL
369
+ **EDA Approach:**
370
+ - Match percentage distribution
371
+ - Gap statistics
372
+ - Query coverage
373
+ - Multiple mapping analysis
374
+ - Alignment quality metrics
375
+
376
+ ## Genome Assembly and Annotation
377
+
378
+ ### .agp - Assembly Golden Path
379
+ **Description:** Assembly structure description
380
+ **Typical Data:** Scaffold composition, gap information
381
+ **Use Cases:** Genome assembly representation
382
+ **Python Libraries:**
383
+ - Custom parsers (simple tab-delimited)
384
+ - Assembly analysis tools
385
+ **EDA Approach:**
386
+ - Scaffold statistics (N50, L50)
387
+ - Gap type and size distribution
388
+ - Component length analysis
389
+ - Assembly contiguity metrics
390
+ - Unplaced contig analysis
391
+
392
+ ### .scaffolds / .contigs - Assembly Sequences
393
+ **Description:** Assembled sequences (usually FASTA)
394
+ **Typical Data:** Assembled genomic sequences
395
+ **Use Cases:** Genome assembly output
396
+ **Python Libraries:**
397
+ - Same as FASTA format
398
+ - Assembly-specific tools (QUAST)
399
+ **EDA Approach:**
400
+ - Assembly statistics (N50, N90, etc.)
401
+ - Length distribution
402
+ - Coverage analysis
403
+ - Gap (N) content
404
+ - Duplication assessment
405
+ - BUSCO completeness (requires a matching lineage dataset)
406
+
407
+ ### .2bit - Compressed Genome Format
408
+ **Description:** UCSC compact genome format
409
+ **Typical Data:** Reference genomes (highly compressed)
410
+ **Use Cases:** Efficient genome storage and access
411
+ **Python Libraries:**
412
+ - `py2bit`: `py2bit.open('file.2bit')`
413
+ - `twobitreader`: Alternative reader
414
+ **EDA Approach:**
415
+ - Compression efficiency
416
+ - Random access performance
417
+ - Sequence extraction validation
418
+ - Masked region analysis
419
+ - N content and distribution
420
+
421
+ ### .sizes - Chromosome Sizes
422
+ **Description:** Simple format with chromosome lengths
423
+ **Typical Data:** Tab-delimited chromosome names and sizes
424
+ **Use Cases:** Genome browsers, coordinate validation
425
+ **Python Libraries:**
426
+ - Simple file reading with pandas
427
+ - Built into many genomic tools
428
+ **EDA Approach:**
429
+ - Genome size calculation
430
+ - Chromosome count
431
+ - Size distribution
432
+ - Karyotype validation
433
+ - Completeness check against reference
434
+
435
+ ## Phylogenetics and Evolution
436
+
437
+ ### .nwk / .newick - Newick Tree Format
438
+ **Description:** Parenthetical tree representation
439
+ **Typical Data:** Phylogenetic trees with branch lengths
440
+ **Use Cases:** Evolutionary analysis, tree visualization
441
+ **Python Libraries:**
442
+ - `Biopython`: `Phylo.read('file.nwk', 'newick')`
443
+ - `ete3`: `ete3.Tree('file.nwk')`
444
+ - `dendropy`: Phylogenetic computing
445
+ **EDA Approach:**
446
+ - Tree structure analysis (tips, internal nodes)
447
+ - Branch length distribution
448
+ - Tree balance metrics
449
+ - Ultrametricity check
450
+ - Bootstrap support analysis
451
+ - Topology validation
452
+
453
+ ### .nexus - Nexus Format
454
+ **Description:** Rich format for phylogenetic data
455
+ **Typical Data:** Alignments, trees, character matrices
456
+ **Use Cases:** Phylogenetic software interchange
457
+ **Python Libraries:**
458
+ - `Biopython`: Nexus support
459
+ - `dendropy`: Comprehensive Nexus handling
460
+ **EDA Approach:**
461
+ - Data block analysis
462
+ - Character type distribution
463
+ - Tree block validation
464
+ - Taxa consistency
465
+ - Command block parsing
466
+ - Format compliance checking
467
+
468
+ ### .phylip - PHYLIP Format
469
+ **Description:** Sequence alignment format (strict/relaxed)
470
+ **Typical Data:** Multiple sequence alignments
471
+ **Use Cases:** Phylogenetic analysis input
472
+ **Python Libraries:**
473
+ - `Biopython`: `AlignIO.read('file.phy', 'phylip')`
474
+ - `dendropy`: PHYLIP support
475
+ **EDA Approach:**
476
+ - Alignment dimensions
477
+ - Sequence length uniformity
478
+ - Gap position analysis
479
+ - Informative site calculation
480
+ - Format variant detection (strict vs relaxed)
481
+
482
+ ### .paml - PAML Output
483
+ **Description:** Output from PAML phylogenetic software
484
+ **Typical Data:** Evolutionary model results, dN/dS ratios
485
+ **Use Cases:** Molecular evolution analysis
486
+ **Python Libraries:**
487
+ - Custom parsers for specific PAML programs
488
+ - `Biopython`: Basic PAML parsing
489
+ **EDA Approach:**
490
+ - Model parameter extraction
491
+ - Likelihood values
492
+ - dN/dS ratio distribution
493
+ - Branch-specific results
494
+ - Convergence assessment
495
+
496
+ ## Annotated Sequence, Epigenomics, and Other Genomics Formats
497
+
498
+ ### .embl - EMBL Format
499
+ **Description:** Rich sequence annotation format
500
+ **Typical Data:** Sequences with extensive annotations
501
+ **Use Cases:** Sequence databases, genome records
502
+ **Python Libraries:**
503
+ - `Biopython`: `SeqIO.read('file.embl', 'embl')`
504
+ **EDA Approach:**
505
+ - Feature annotation completeness
506
+ - Sequence length and type
507
+ - Reference information
508
+ - Cross-reference validation
509
+ - Feature overlap analysis
510
+
511
+ ### .genbank / .gb / .gbk - GenBank Format
512
+ **Description:** NCBI's sequence annotation format
513
+ **Typical Data:** Annotated sequences with features
514
+ **Use Cases:** Sequence databases, annotation transfer
515
+ **Python Libraries:**
516
+ - `Biopython`: `SeqIO.parse('file.gb', 'genbank')`
517
+ **EDA Approach:**
518
+ - Feature type distribution
519
+ - CDS analysis (start codons, stops)
520
+ - Translation validation
521
+ - Annotation completeness
522
+ - Source organism extraction
523
+ - Reference and publication info
524
+ - Locus tag consistency
525
+
526
+ ### .sff - Standard Flowgram Format
527
+ **Description:** 454/Roche sequencing data format
528
+ **Typical Data:** Raw pyrosequencing flowgrams
529
+ **Use Cases:** Legacy 454 sequencing data
530
+ **Python Libraries:**
531
+ - `Biopython`: `SeqIO.parse('file.sff', 'sff')`
532
+ - Platform-specific tools
533
+ **EDA Approach:**
534
+ - Read count and length
535
+ - Flowgram signal quality
536
+ - Key sequence detection
537
+ - Adapter trimming validation
538
+ - Quality score distribution
539
+
540
+ ### .hdf5 (Genomics Specific)
541
+ **Description:** HDF5 for genomics (10X, Hi-C, etc.)
542
+ **Typical Data:** High-throughput genomics data
543
+ **Use Cases:** 10X Genomics, spatial transcriptomics
544
+ **Python Libraries:**
545
+ - `h5py`: Low-level access
546
+ - `scanpy`: For 10X data
547
+ - `cooler`: For Hi-C data
548
+ **EDA Approach:**
549
+ - Dataset structure exploration
550
+ - Barcode statistics
551
+ - UMI counting
552
+ - Feature-barcode matrix analysis
553
+ - Spatial coordinates (if applicable)
554
+
555
+ ### .cool / .mcool - Cooler Format
556
+ **Description:** HDF5-based Hi-C contact matrices
557
+ **Typical Data:** Chromatin interaction matrices
558
+ **Use Cases:** 3D genome analysis, Hi-C data
559
+ **Python Libraries:**
560
+ - `cooler`: `cooler.Cooler('file.cool')`
561
+ - `hicstraw`: For .hic format
562
+ **EDA Approach:**
563
+ - Resolution analysis
564
+ - Contact matrix statistics
565
+ - Distance decay curves
566
+ - Compartment analysis
567
+ - TAD boundary detection
568
+ - Balance factor validation
569
+
570
+ ### .hic - Hi-C Binary Format
571
+ **Description:** Juicer binary Hi-C format
572
+ **Typical Data:** Multi-resolution Hi-C matrices
573
+ **Use Cases:** Hi-C analysis with Juicer tools
574
+ **Python Libraries:**
575
+ - `hicstraw`: `hicstraw.HiCFile('file.hic')`
576
+ - `straw`: C++ library with Python bindings
577
+ **EDA Approach:**
578
+ - Available resolutions
579
+ - Normalization methods
580
+ - Contact statistics
581
+ - Chromosomal interactions
582
+ - Quality metrics
583
+
584
+ ### .bw (ChIP-seq / ATAC-seq specific)
585
+ **Description:** BigWig files for epigenomics
586
+ **Typical Data:** Coverage or enrichment signals
587
+ **Use Cases:** ChIP-seq, ATAC-seq, DNase-seq
588
+ **Python Libraries:**
589
+ - `pyBigWig`: Standard bigWig access
590
+ **EDA Approach:**
591
+ - Peak enrichment patterns
592
+ - Background signal analysis
593
+ - Sample correlation
594
+ - Signal-to-noise ratio
595
+ - Library complexity metrics
596
+
597
+ ### .narrowPeak / .broadPeak - ENCODE Peak Formats
598
+ **Description:** BED-based formats for peaks
599
+ **Typical Data:** Peak calls with scores and p-values
600
+ **Use Cases:** ChIP-seq peak calling output
601
+ **Python Libraries:**
602
+ - `pybedtools`: BED-compatible
603
+ - Custom parsers for peak-specific fields
604
+ **EDA Approach:**
605
+ - Peak count and width distribution
606
+ - Signal value distribution
607
+ - Q-value and p-value analysis
608
+ - Peak summit analysis
609
+ - Overlap with known features
610
+ - Motif enrichment preparation
611
+
612
+ ### .wig - Wiggle Format
613
+ **Description:** Dense continuous genomic data
614
+ **Typical Data:** Coverage or signal tracks
615
+ **Use Cases:** Genome browser visualization
616
+ **Python Libraries:**
617
+ - `pyBigWig`: Can convert to bigWig
618
+ - Custom parsers for wiggle format
619
+ **EDA Approach:**
620
+ - Signal statistics
621
+ - Coverage metrics
622
+ - Format variant (fixedStep vs variableStep)
623
+ - Span parameter analysis
624
+ - Conversion efficiency to bigWig
625
+
626
+ ### .ab1 - Sanger Sequencing Trace
627
+ **Description:** Binary chromatogram format
628
+ **Typical Data:** Sanger sequencing traces
629
+ **Use Cases:** Capillary sequencing validation
630
+ **Python Libraries:**
631
+ - `Biopython`: `SeqIO.read('file.ab1', 'abi')`
632
+ - `tracy` tools: For quality assessment
633
+ **EDA Approach:**
634
+ - Base calling quality
635
+ - Trace quality scores
636
+ - Mixed base detection
637
+ - Primer and vector detection
638
+ - Read length and quality region
639
+ - Heterozygosity detection
640
+
641
+ ### .scf - Standard Chromatogram Format
642
+ **Description:** Sanger sequencing chromatogram
643
+ **Typical Data:** Base calls and confidence values
644
+ **Use Cases:** Sequencing trace analysis
645
+ **Python Libraries:**
646
+ - `Biopython`: SCF format support
647
+ **EDA Approach:**
648
+ - Similar to AB1 format
649
+ - Quality score profiles
650
+ - Peak height ratios
651
+ - Signal-to-noise metrics
652
+
653
+ ### .idx - Index Files (Generic)
654
+ **Description:** Index files for various formats
655
+ **Typical Data:** Fast random access indices
656
+ **Use Cases:** Efficient data access (BAM, VCF, etc.)
657
+ **Python Libraries:**
658
+ - Format-specific libraries handle indices
659
+ - `pysam`: Auto-handles BAI, CSI indices
660
+ **EDA Approach:**
661
+ - Index completeness validation
662
+ - Binning strategy analysis
663
+ - Access performance metrics
664
+ - Index size vs data size ratio
.scider/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chemistry and Molecular File Formats Reference
2
+
3
+ This reference covers file formats commonly used in computational chemistry, cheminformatics, molecular modeling, and related fields.
4
+
5
+ ## Structure File Formats
6
+
7
+ ### .pdb - Protein Data Bank
8
+ **Description:** Standard format for 3D structures of biological macromolecules
9
+ **Typical Data:** Atomic coordinates, residue information, secondary structure, crystal structure data
10
+ **Use Cases:** Protein structure analysis, molecular visualization, docking studies
11
+ **Python Libraries:**
12
+ - `Biopython`: `Bio.PDB`
13
+ - `MDAnalysis`: `MDAnalysis.Universe('file.pdb')`
14
+ - `PyMOL`: `pymol.cmd.load('file.pdb')`
15
+ - `ProDy`: `prody.parsePDB('file.pdb')`
16
+ **EDA Approach:**
17
+ - Structure validation (bond lengths, angles, clashes)
18
+ - Secondary structure analysis
19
+ - B-factor distribution
20
+ - Missing residues/atoms detection
21
+ - Ramachandran plots for validation
22
+ - Surface area and volume calculations
23
+
24
+ ### .cif - Crystallographic Information File
25
+ **Description:** Structured data format for crystallographic information
26
+ **Typical Data:** Unit cell parameters, atomic coordinates, symmetry operations, experimental data
27
+ **Use Cases:** Crystal structure determination, structural biology, materials science
28
+ **Python Libraries:**
29
+ - `gemmi`: `gemmi.cif.read_file('file.cif')`
30
+ - `PyCifRW`: `CifFile.ReadCif('file.cif')`
31
+ - `Biopython`: `Bio.PDB.MMCIFParser()`
32
+ **EDA Approach:**
33
+ - Data completeness check
34
+ - Resolution and quality metrics
35
+ - Unit cell parameter analysis
36
+ - Symmetry group validation
37
+ - Atomic displacement parameters
38
+ - R-factors and validation metrics
39
+
40
+ ### .mol - MDL Molfile
41
+ **Description:** Chemical structure file format by MDL/Accelrys
42
+ **Typical Data:** 2D/3D coordinates, atom types, bond orders, charges
43
+ **Use Cases:** Chemical database storage, cheminformatics, drug design
44
+ **Python Libraries:**
45
+ - `RDKit`: `Chem.MolFromMolFile('file.mol')`
46
+ - `Open Babel`: `pybel.readfile('mol', 'file.mol')`
47
+ - `ChemoPy`: For descriptor calculation
48
+ **EDA Approach:**
49
+ - Molecular property calculation (MW, logP, TPSA)
50
+ - Functional group analysis
51
+ - Ring system detection
52
+ - Stereochemistry validation
53
+ - 2D/3D coordinate consistency
54
+ - Valence and charge validation
55
+
56
+ ### .mol2 - Tripos Mol2
57
+ **Description:** Complete 3D molecular structure format with atom typing
58
+ **Typical Data:** Coordinates, SYBYL atom types, bond types, charges, substructures
59
+ **Use Cases:** Molecular docking, QSAR studies, drug discovery
60
+ **Python Libraries:**
61
+ - `RDKit`: `Chem.MolFromMol2File('file.mol2')`
62
+ - `Open Babel`: `pybel.readfile('mol2', 'file.mol2')`
63
+ - `MDAnalysis`: Can parse mol2 topology
64
+ **EDA Approach:**
65
+ - Atom type distribution
66
+ - Partial charge analysis
67
+ - Bond type statistics
68
+ - Substructure identification
69
+ - Conformational analysis
70
+ - Energy minimization status check
71
+
72
+ ### .sdf - Structure Data File
73
+ **Description:** Multi-structure file format with associated data
74
+ **Typical Data:** Multiple molecular structures with properties/annotations
75
+ **Use Cases:** Chemical databases, virtual screening, compound libraries
76
+ **Python Libraries:**
77
+ - `RDKit`: `Chem.SDMolSupplier('file.sdf')`
78
+ - `Open Babel`: `pybel.readfile('sdf', 'file.sdf')`
79
+ - `PandasTools` (RDKit): For DataFrame integration
80
+ **EDA Approach:**
81
+ - Dataset size and diversity metrics
82
+ - Property distribution analysis (MW, logP, etc.)
83
+ - Structural diversity (Tanimoto similarity)
84
+ - Missing data assessment
85
+ - Outlier detection in properties
86
+ - Scaffold analysis
87
+
88
+ ### .xyz - XYZ Coordinates
89
+ **Description:** Simple Cartesian coordinate format
90
+ **Typical Data:** Atom types and 3D coordinates
91
+ **Use Cases:** Quantum chemistry, geometry optimization, molecular dynamics
92
+ **Python Libraries:**
93
+ - `ASE`: `ase.io.read('file.xyz')`
94
+ - `Open Babel`: `pybel.readfile('xyz', 'file.xyz')`
95
+ - `cclib`: For parsing QM outputs with xyz
96
+ **EDA Approach:**
97
+ - Geometry analysis (bond lengths, angles, dihedrals)
98
+ - Center of mass calculation
99
+ - Moment of inertia
100
+ - Molecular size metrics
101
+ - Coordinate validation
102
+ - Symmetry detection
103
+
104
+ ### .smi / .smiles - SMILES String
105
+ **Description:** Line notation for chemical structures
106
+ **Typical Data:** Text representation of molecular structure
107
+ **Use Cases:** Chemical databases, literature mining, data exchange
108
+ **Python Libraries:**
109
+ - `RDKit`: `Chem.MolFromSmiles(smiles)`
110
+ - `Open Babel`: Can parse SMILES
111
+ - `DeepChem`: For ML on SMILES
112
+ **EDA Approach:**
113
+ - SMILES syntax validation
114
+ - Descriptor calculation from SMILES
115
+ - Fingerprint generation
116
+ - Substructure searching
117
+ - Tautomer enumeration
118
+ - Stereoisomer handling
119
+
120
+ ### .pdbqt - AutoDock PDBQT
121
+ **Description:** Modified PDB format for AutoDock docking
122
+ **Typical Data:** Coordinates, partial charges, atom types for docking
123
+ **Use Cases:** Molecular docking, virtual screening
124
+ **Python Libraries:**
125
+ - `Meeko`: For PDBQT preparation
126
+ - `Open Babel`: Can read PDBQT
127
+ - `ProDy`: Limited PDBQT support
128
+ **EDA Approach:**
129
+ - Charge distribution analysis
130
+ - Rotatable bond identification
131
+ - Atom type validation
132
+ - Coordinate quality check
133
+ - Hydrogen placement validation
134
+ - Torsion definition analysis
135
+
136
+ ### .mae - Maestro Format
137
+ **Description:** Schrödinger's proprietary molecular structure format
138
+ **Typical Data:** Structures, properties, annotations from Schrödinger suite
139
+ **Use Cases:** Drug discovery, molecular modeling with Schrödinger tools
140
+ **Python Libraries:**
141
+ - `schrodinger.structure`: Requires Schrödinger installation
142
+ - Custom parsers for basic reading
143
+ **EDA Approach:**
144
+ - Property extraction and analysis
145
+ - Structure quality metrics
146
+ - Conformer analysis
147
+ - Docking score distributions
148
+ - Ligand efficiency metrics
149
+
150
+ ### .gro - GROMACS Coordinate File
151
+ **Description:** Molecular structure file for GROMACS MD simulations
152
+ **Typical Data:** Atom positions, velocities, box vectors
153
+ **Use Cases:** Molecular dynamics simulations, GROMACS workflows
154
+ **Python Libraries:**
155
+ - `MDAnalysis`: `MDAnalysis.Universe('file.gro')`
156
+ - `MDTraj`: `mdtraj.load_gro('file.gro')`
157
+ - `GromacsWrapper`: For GROMACS integration
158
+ **EDA Approach:**
159
+ - System composition analysis
160
+ - Box dimension validation
161
+ - Atom position distribution
162
+ - Velocity distribution (if present)
163
+ - Density calculation
164
+ - Solvation analysis
165
+
166
+ ## Computational Chemistry Output Formats
167
+
168
+ ### .log - Gaussian Log File
169
+ **Description:** Output from Gaussian quantum chemistry calculations
170
+ **Typical Data:** Energies, geometries, frequencies, orbitals, populations
171
+ **Use Cases:** QM calculations, geometry optimization, frequency analysis
172
+ **Python Libraries:**
173
+ - `cclib`: `cclib.io.ccread('file.log')`
174
+ - `GaussianRunPack`: For Gaussian workflows
175
+ - Custom parsers with regex
176
+ **EDA Approach:**
177
+ - Convergence analysis
178
+ - Energy profile extraction
179
+ - Vibrational frequency analysis
180
+ - Orbital energy levels
181
+ - Population analysis (Mulliken, NBO)
182
+ - Thermochemistry data extraction
183
+
184
+ ### .out - Quantum Chemistry Output
185
+ **Description:** Generic output file from various QM packages
186
+ **Typical Data:** Calculation results, energies, properties
187
+ **Use Cases:** QM calculations across different software
188
+ **Python Libraries:**
189
+ - `cclib`: Universal parser for QM outputs
190
+ - `ASE`: Can read some output formats
191
+ **EDA Approach:**
192
+ - Software-specific parsing
193
+ - Convergence criteria check
194
+ - Energy and gradient trends
195
+ - Basis set and method validation
196
+ - Computational cost analysis
197
+
198
+ ### .wfn / .wfx - Wavefunction Files
199
+ **Description:** Wavefunction data for quantum chemical analysis
200
+ **Typical Data:** Molecular orbitals, basis sets, density matrices
201
+ **Use Cases:** Electron density analysis, QTAIM analysis
202
+ **Python Libraries:**
203
+ - `Multiwfn`: Interface via Python
204
+ - `Horton`: For wavefunction analysis
205
+ - Custom parsers for specific formats
206
+ **EDA Approach:**
207
+ - Orbital population analysis
208
+ - Electron density distribution
209
+ - Critical point analysis (QTAIM)
210
+ - Molecular orbital visualization
211
+ - Bonding analysis
212
+
213
+ ### .fchk - Gaussian Formatted Checkpoint
214
+ **Description:** Formatted checkpoint file from Gaussian
215
+ **Typical Data:** Complete wavefunction data, results, geometry
216
+ **Use Cases:** Post-processing Gaussian calculations
217
+ **Python Libraries:**
218
+ - `cclib`: Can parse fchk files
219
+ - `GaussView` Python API (if available)
220
+ - Custom parsers
221
+ **EDA Approach:**
222
+ - Wavefunction quality assessment
223
+ - Property extraction
224
+ - Basis set information
225
+ - Gradient and Hessian analysis
226
+ - Natural orbital analysis
227
+
228
+ ### .cube - Gaussian Cube File
229
+ **Description:** Volumetric data on a 3D grid
230
+ **Typical Data:** Electron density, molecular orbitals, ESP on grid
231
+ **Use Cases:** Visualization of volumetric properties
232
+ **Python Libraries:**
233
+ - `cclib`: `cclib.io.ccread('file.cube')`
234
+ - `ase.io`: `ase.io.read('file.cube')`
235
+ - `pyquante`: For cube file manipulation
236
+ **EDA Approach:**
237
+ - Grid dimension and spacing analysis
238
+ - Value distribution statistics
239
+ - Isosurface value determination
240
+ - Integration over volume
241
+ - Comparison between different cubes
242
+
243
+ ## Molecular Dynamics Formats
244
+
245
+ ### .dcd - Binary Trajectory
246
+ **Description:** Binary trajectory format (CHARMM, NAMD)
247
+ **Typical Data:** Time series of atomic coordinates
248
+ **Use Cases:** MD trajectory analysis
249
+ **Python Libraries:**
250
+ - `MDAnalysis`: `MDAnalysis.Universe(topology, 'traj.dcd')`
251
+ - `MDTraj`: `mdtraj.load_dcd('traj.dcd', top='topology.pdb')`
252
+ - `PyTraj` (Amber): Limited support
253
+ **EDA Approach:**
254
+ - RMSD/RMSF analysis
255
+ - Trajectory length and frame count
256
+ - Coordinate range and drift
257
+ - Periodic boundary handling
258
+ - File integrity check
259
+ - Time step validation
260
+
261
+ ### .xtc - Compressed Trajectory
262
+ **Description:** GROMACS compressed trajectory format
263
+ **Typical Data:** Compressed coordinates from MD simulations
264
+ **Use Cases:** Space-efficient MD trajectory storage
265
+ **Python Libraries:**
266
+ - `MDAnalysis`: `MDAnalysis.Universe(topology, 'traj.xtc')`
267
+ - `MDTraj`: `mdtraj.load_xtc('traj.xtc', top='topology.pdb')`
268
+ **EDA Approach:**
269
+ - Compression ratio assessment
270
+ - Precision loss evaluation
271
+ - RMSD over time
272
+ - Structural stability metrics
273
+ - Sampling frequency analysis
274
+
275
+ ### .trr - GROMACS Trajectory
276
+ **Description:** Full precision GROMACS trajectory
277
+ **Typical Data:** Coordinates, velocities, forces from MD
278
+ **Use Cases:** High-precision MD analysis
279
+ **Python Libraries:**
280
+ - `MDAnalysis`: Full support
281
+ - `MDTraj`: Can read trr files
282
+ - `GromacsWrapper`
283
+ **EDA Approach:**
284
+ - Full system dynamics analysis
285
+ - Energy conservation check (with velocities)
286
+ - Force analysis
287
+ - Temperature and pressure validation
288
+ - System equilibration assessment
289
+
290
+ ### .nc / .netcdf - Amber NetCDF Trajectory
291
+ **Description:** Network Common Data Form trajectory
292
+ **Typical Data:** MD coordinates, velocities, forces
293
+ **Use Cases:** Amber MD simulations, large trajectory storage
294
+ **Python Libraries:**
295
+ - `MDAnalysis`: NetCDF support
296
+ - `PyTraj`: Native Amber analysis
297
+ - `netCDF4`: Low-level access
298
+ **EDA Approach:**
299
+ - Metadata extraction
300
+ - Trajectory statistics
301
+ - Time series analysis
302
+ - Replica exchange analysis
303
+ - Multi-dimensional data extraction
304
+
305
+ ### .top - GROMACS Topology
306
+ **Description:** Molecular topology for GROMACS
307
+ **Typical Data:** Atom types, bonds, angles, force field parameters
308
+ **Use Cases:** MD simulation setup and analysis
309
+ **Python Libraries:**
310
+ - `ParmEd`: `parmed.load_file('system.top')`
311
+ - `MDAnalysis`: Can parse topology
312
+ - Custom parsers for specific fields
313
+ **EDA Approach:**
314
+ - Force field parameter validation
315
+ - System composition
316
+ - Bond/angle/dihedral distribution
317
+ - Charge neutrality check
318
+ - Molecule type enumeration
319
+
320
+ ### .psf - Protein Structure File (CHARMM)
321
+ **Description:** Topology file for CHARMM/NAMD
322
+ **Typical Data:** Atom connectivity, types, charges
323
+ **Use Cases:** CHARMM/NAMD MD simulations
324
+ **Python Libraries:**
325
+ - `MDAnalysis`: Native PSF support
326
+ - `ParmEd`: Can read PSF files
327
+ **EDA Approach:**
328
+ - Topology validation
329
+ - Connectivity analysis
330
+ - Charge distribution
331
+ - Atom type statistics
332
+ - Segment analysis
333
+
334
+ ### .prmtop - Amber Parameter/Topology
335
+ **Description:** Amber topology and parameter file
336
+ **Typical Data:** System topology, force field parameters
337
+ **Use Cases:** Amber MD simulations
338
+ **Python Libraries:**
339
+ - `ParmEd`: `parmed.load_file('system.prmtop')`
340
+ - `PyTraj`: Native Amber support
341
+ **EDA Approach:**
342
+ - Force field completeness
343
+ - Parameter validation
344
+ - System size and composition
345
+ - Periodic box information
346
+ - Atom mask creation for analysis
347
+
348
+ ### .inpcrd / .rst7 - Amber Coordinates
349
+ **Description:** Amber coordinate/restart file
350
+ **Typical Data:** Atomic coordinates, velocities, box info
351
+ **Use Cases:** Starting coordinates for Amber MD
352
+ **Python Libraries:**
353
+ - `ParmEd`: Works with prmtop
354
+ - `PyTraj`: Amber coordinate reading
355
+ **EDA Approach:**
356
+ - Coordinate validity
357
+ - System initialization check
358
+ - Box vector validation
359
+ - Velocity distribution (if restart)
360
+ - Energy minimization status
361
+
362
+ ## Spectroscopy and Analytical Data
363
+
364
+ ### .jcamp / .jdx - JCAMP-DX
365
+ **Description:** JCAMP-DX, the text-based data exchange format defined by the Joint Committee on Atomic and Molecular Physical Data
366
+ **Typical Data:** Spectroscopic data (IR, NMR, MS, UV-Vis)
367
+ **Use Cases:** Spectroscopy data exchange and archiving
368
+ **Python Libraries:**
369
+ - `jcamp`: `jcamp.jcamp_readfile('file.jdx')`
370
+ - `nmrglue`: For NMR JCAMP files
371
+ - Custom parsers for specific subtypes
372
+ **EDA Approach:**
373
+ - Peak detection and analysis
374
+ - Baseline correction assessment
375
+ - Signal-to-noise calculation
376
+ - Spectral range validation
377
+ - Integration analysis
378
+ - Comparison with reference spectra
379
+
380
+ ### .mzML - Mass Spectrometry Markup Language
381
+ **Description:** Standard XML format for mass spectrometry data
382
+ **Typical Data:** MS/MS spectra, chromatograms, metadata
383
+ **Use Cases:** Proteomics, metabolomics, mass spectrometry workflows
384
+ **Python Libraries:**
385
+ - `pymzml`: `pymzml.run.Reader('file.mzML')`
386
+ - `pyteomics`: `pyteomics.mzml.read('file.mzML')`
387
+ - `MSFileReader` wrappers
388
+ **EDA Approach:**
389
+ - Scan count and types
390
+ - MS level distribution
391
+ - Retention time range
392
+ - m/z range and resolution
393
+ - Peak intensity distribution
394
+ - Data completeness
395
+ - Quality control metrics
396
+
397
+ ### .mzXML - Mass Spectrometry XML
398
+ **Description:** Open XML format for MS data
399
+ **Typical Data:** Mass spectra, retention times, peak lists
400
+ **Use Cases:** Legacy MS data, metabolomics
401
+ **Python Libraries:**
402
+ - `pymzml`: mzML only; convert mzXML to mzML first (e.g. with ProteoWizard `msconvert`)
403
+ - `pyteomics.mzxml`
404
+ - `lxml` for direct XML parsing
405
+ **EDA Approach:**
406
+ - Similar to mzML
407
+ - Version compatibility check
408
+ - Conversion quality assessment
409
+ - Peak picking validation
410
+
411
+ ### .raw - Vendor Raw Data
412
+ **Description:** Proprietary instrument data files (Thermo, Bruker, etc.)
413
+ **Typical Data:** Raw instrument signals, unprocessed data
414
+ **Use Cases:** Direct instrument data access
415
+ **Python Libraries:**
416
+ - `pymsfilereader`: For Thermo RAW files
417
+ - `ThermoRawFileParser`: CLI wrapper
418
+ - Vendor-specific APIs (Thermo, Bruker Compass)
419
+ **EDA Approach:**
420
+ - Instrument method extraction
421
+ - Raw signal quality
422
+ - Calibration status
423
+ - Scan function analysis
424
+ - Chromatographic quality metrics
425
+
426
+ ### .d - Agilent Data Directory
427
+ **Description:** Agilent's data folder structure
428
+ **Typical Data:** LC-MS, GC-MS data and metadata
429
+ **Use Cases:** Agilent instrument data processing
430
+ **Python Libraries:**
431
+ - `agilent-reader`: Community tools
432
+ - `Chemstation` Python integration
433
+ - Custom directory parsing
434
+ **EDA Approach:**
435
+ - Directory structure validation
436
+ - Method parameter extraction
437
+ - Signal file integrity
438
+ - Calibration curve analysis
439
+ - Sequence information extraction
440
+
441
+ ### .fid - NMR Free Induction Decay
442
+ **Description:** Raw NMR time-domain data
443
+ **Typical Data:** Time-domain NMR signal
444
+ **Use Cases:** NMR processing and analysis
445
+ **Python Libraries:**
446
+ - `nmrglue`: `nmrglue.bruker.read('experiment_dir')` (Bruker) or `nmrglue.varian.read_fid('fid')` (Varian/Agilent)
447
+ - `nmrstarlib`: For NMR-STAR files
448
+ **EDA Approach:**
449
+ - Signal decay analysis
450
+ - Noise level assessment
451
+ - Acquisition parameter validation
452
+ - Apodization function selection
453
+ - Zero-filling optimization
454
+ - Phasing parameter estimation
455
+
456
+ ### .ft - NMR Frequency-Domain Data
457
+ **Description:** Processed NMR spectrum
458
+ **Typical Data:** Frequency-domain NMR data
459
+ **Use Cases:** NMR analysis and interpretation
460
+ **Python Libraries:**
461
+ - `nmrglue`: Comprehensive NMR support
462
+ - `pyNMR`: For processing
463
+ **EDA Approach:**
464
+ - Peak picking and integration
465
+ - Chemical shift calibration
466
+ - Multiplicity analysis
467
+ - Coupling constant extraction
468
+ - Spectral quality metrics
469
+ - Reference compound identification
470
+
471
+ ### .spc - Spectroscopy File
472
+ **Description:** Thermo Galactic spectroscopy format
473
+ **Typical Data:** IR, Raman, UV-Vis spectra
474
+ **Use Cases:** Spectroscopic data from various instruments
475
+ **Python Libraries:**
476
+ - `spc`: `spc.File('file.spc')`
477
+ - Custom parsers for binary format
478
+ **EDA Approach:**
479
+ - Spectral resolution
480
+ - Wavelength/wavenumber range
481
+ - Baseline characterization
482
+ - Peak identification
483
+ - Derivative spectra calculation
484
+
485
+ ## Chemical Database Formats
486
+
487
+ ### .inchi - International Chemical Identifier
488
+ **Description:** Text identifier for chemical substances
489
+ **Typical Data:** Layered chemical structure representation
490
+ **Use Cases:** Chemical database keys, structure searching
491
+ **Python Libraries:**
492
+ - `RDKit`: `Chem.MolFromInchi(inchi)`
493
+ - `Open Babel`: InChI conversion
494
+ **EDA Approach:**
495
+ - InChI validation
496
+ - Layer analysis
497
+ - Stereochemistry verification
498
+ - InChI key generation
499
+ - Structure round-trip validation
500
+
501
+ ### .cdx / .cdxml - ChemDraw Exchange
502
+ **Description:** ChemDraw drawing file format
503
+ **Typical Data:** 2D chemical structures with annotations
504
+ **Use Cases:** Chemical drawing, publication figures
505
+ **Python Libraries:**
506
+ - `RDKit`: Can import some CDXML
507
+ - `Open Babel`: Limited support
508
+ - `ChemDraw` Python API (commercial)
509
+ **EDA Approach:**
510
+ - Structure extraction
511
+ - Annotation preservation
512
+ - Style consistency
513
+ - 2D coordinate validation
514
+
515
+ ### .cml - Chemical Markup Language
516
+ **Description:** XML-based chemical structure format
517
+ **Typical Data:** Chemical structures, reactions, properties
518
+ **Use Cases:** Semantic chemical data representation
519
+ **Python Libraries:**
520
+ - `RDKit`: CML support
521
+ - `Open Babel`: Good CML support
522
+ - `lxml`: For XML parsing
523
+ **EDA Approach:**
524
+ - XML schema validation
525
+ - Namespace handling
526
+ - Property extraction
527
+ - Reaction scheme analysis
528
+ - Metadata completeness
529
+
530
+ ### .rxn - MDL Reaction File
531
+ **Description:** Chemical reaction structure file
532
+ **Typical Data:** Reactants, products, reaction arrows
533
+ **Use Cases:** Reaction databases, synthesis planning
534
+ **Python Libraries:**
535
+ - `RDKit`: `AllChem.ReactionFromRxnFile('file.rxn')` (from `rdkit.Chem import AllChem`)
536
+ - `Open Babel`: Reaction support
537
+ **EDA Approach:**
538
+ - Reaction balancing validation
539
+ - Atom mapping analysis
540
+ - Reagent identification
541
+ - Stereochemistry changes
542
+ - Reaction classification
543
+
544
+ ### .rdf - Reaction Data File
545
+ **Description:** Multi-reaction file format
546
+ **Typical Data:** Multiple reactions with data
547
+ **Use Cases:** Reaction databases
548
+ **Python Libraries:**
549
+ - `RDKit`: RDF reading capabilities
550
+ - Custom parsers
551
+ **EDA Approach:**
552
+ - Reaction yield statistics
553
+ - Condition analysis
554
+ - Success rate patterns
555
+ - Reagent frequency analysis
556
+
557
+ ## Computational Output and Data
558
+
559
+ ### .hdf5 / .h5 - Hierarchical Data Format
560
+ **Description:** Container for scientific data arrays
561
+ **Typical Data:** Large arrays, metadata, hierarchical organization
562
+ **Use Cases:** Large dataset storage, computational results
563
+ **Python Libraries:**
564
+ - `h5py`: `h5py.File('file.h5', 'r')`
565
+ - `tables` (PyTables): Advanced HDF5 interface
566
+ - `pandas`: Can read HDF5
567
+ **EDA Approach:**
568
+ - Dataset structure exploration
569
+ - Array shape and dtype analysis
570
+ - Metadata extraction
571
+ - Memory-efficient data sampling
572
+ - Chunk optimization analysis
573
+ - Compression ratio assessment
574
+
575
+ ### .pkl / .pickle - Python Pickle
576
+ **Description:** Serialized Python objects
577
+ **Typical Data:** Any Python object (molecules, dataframes, models)
578
+ **Use Cases:** Intermediate data storage, model persistence
579
+ **Python Libraries:**
580
+ - `pickle`: Built-in serialization
581
+ - `joblib`: Enhanced pickling for large arrays
582
+ - `dill`: Extended pickle support
583
+ **EDA Approach:**
584
+ - Object type inspection
585
+ - Size and complexity analysis
586
+ - Version compatibility check
587
+ - Security validation (only unpickle files from a trusted source; loading a pickle can execute arbitrary code)
588
+ - Deserialization testing
589
+
590
+ ### .npy / .npz - NumPy Arrays
591
+ **Description:** NumPy array binary format
592
+ **Typical Data:** Numerical arrays (coordinates, features, matrices)
593
+ **Use Cases:** Fast numerical data I/O
594
+ **Python Libraries:**
595
+ - `numpy`: `np.load('file.npy')`
596
+ - Direct memory mapping for large files
597
+ **EDA Approach:**
598
+ - Array shape and dimensions
599
+ - Data type and precision
600
+ - Statistical summary (mean, std, range)
601
+ - Missing value detection
602
+ - Outlier identification
603
+ - Memory footprint analysis
604
+
605
+ ### .mat - MATLAB Data File
606
+ **Description:** MATLAB workspace data
607
+ **Typical Data:** Arrays, structures from MATLAB
608
+ **Use Cases:** MATLAB-Python data exchange
609
+ **Python Libraries:**
610
+ - `scipy.io`: `scipy.io.loadmat('file.mat')`
611
+ - `h5py`: For v7.3 MAT files
612
+ **EDA Approach:**
613
+ - Variable extraction and types
614
+ - Array dimension analysis
615
+ - Structure field exploration
616
+ - MATLAB version compatibility
617
+ - Data type conversion validation
618
+
619
+ ### .csv - Comma-Separated Values
620
+ **Description:** Tabular data in text format
621
+ **Typical Data:** Chemical properties, experimental data, descriptors
622
+ **Use Cases:** Data exchange, analysis, machine learning
623
+ **Python Libraries:**
624
+ - `pandas`: `pd.read_csv('file.csv')`
625
+ - `csv`: Built-in module
626
+ - `polars`: Fast CSV reading
627
+ **EDA Approach:**
628
+ - Data types inference
629
+ - Missing value patterns
630
+ - Statistical summaries
631
+ - Correlation analysis
632
+ - Distribution visualization
633
+ - Outlier detection
634
+
635
+ ### .json - JavaScript Object Notation
636
+ **Description:** Structured text data format
637
+ **Typical Data:** Chemical properties, metadata, API responses
638
+ **Use Cases:** Data interchange, configuration, web APIs
639
+ **Python Libraries:**
640
+ - `json`: Built-in JSON support
641
+ - `pandas`: `pd.read_json()`
642
+ - `ujson`: Faster JSON parsing
643
+ **EDA Approach:**
644
+ - Schema validation
645
+ - Nesting depth analysis
646
+ - Key-value distribution
647
+ - Data type consistency
648
+ - Array length statistics
649
+
650
+ ### .parquet - Apache Parquet
651
+ **Description:** Columnar storage format
652
+ **Typical Data:** Large tabular datasets stored efficiently
653
+ **Use Cases:** Big data, efficient columnar analytics
654
+ **Python Libraries:**
655
+ - `pandas`: `pd.read_parquet('file.parquet')`
656
+ - `pyarrow`: Direct parquet access
657
+ - `fastparquet`: Alternative implementation
658
+ **EDA Approach:**
659
+ - Column statistics from metadata
660
+ - Partition analysis
661
+ - Compression efficiency
662
+ - Row group structure
663
+ - Fast sampling for large files
664
+ - Schema evolution tracking
.scider/skills/exploratory-data-analysis/references/general_scientific_formats.md ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # General Scientific Data Formats Reference
2
+
3
+ This reference covers general-purpose scientific data formats used across multiple disciplines.
4
+
5
+ ## Numerical and Array Data
6
+
7
+ ### .npy - NumPy Array
8
+ **Description:** Binary NumPy array format
9
+ **Typical Data:** N-dimensional arrays of any data type
10
+ **Use Cases:** Fast I/O for numerical data, intermediate results
11
+ **Python Libraries:**
12
+ - `numpy`: `np.load('file.npy')`, `np.save()`
13
+ - Memory-mapped access: `np.load('file.npy', mmap_mode='r')`
14
+ **EDA Approach:**
15
+ - Array shape and dimensionality
16
+ - Data type and precision
17
+ - Statistical summary (mean, std, min, max, percentiles)
18
+ - Missing or invalid values (NaN, inf)
19
+ - Memory footprint
20
+ - Value distribution and histogram
21
+ - Sparsity analysis
22
+ - Correlation structure (if 2D)
23
+
24
+ ### .npz - Compressed NumPy Archive
25
+ **Description:** Multiple NumPy arrays in one file
26
+ **Typical Data:** Collections of related arrays
27
+ **Use Cases:** Saving multiple arrays together, compressed storage
28
+ **Python Libraries:**
29
+ - `numpy`: `np.load('file.npz')` returns dict-like object
30
+ - `np.savez()` or `np.savez_compressed()`
31
+ **EDA Approach:**
32
+ - List of contained arrays
33
+ - Individual array analysis
34
+ - Relationships between arrays
35
+ - Total file size and compression ratio
36
+ - Naming conventions
37
+ - Data consistency checks
38
+
39
+ ### .csv - Comma-Separated Values
40
+ **Description:** Plain text tabular data
41
+ **Typical Data:** Experimental measurements, results tables
42
+ **Use Cases:** Universal data exchange, spreadsheet export
43
+ **Python Libraries:**
44
+ - `pandas`: `pd.read_csv('file.csv')`
45
+ - `csv`: Built-in module
46
+ - `polars`: High-performance CSV reading
47
+ - `numpy`: `np.loadtxt()` or `np.genfromtxt()`
48
+ **EDA Approach:**
49
+ - Row and column counts
50
+ - Data type inference
51
+ - Missing value patterns and frequency
52
+ - Column statistics (numeric: mean, std; categorical: frequencies)
53
+ - Outlier detection
54
+ - Correlation matrix
55
+ - Duplicate row detection
56
+ - Header and index validation
57
+ - Encoding issues detection
58
+
59
+ ### .tsv / .tab - Tab-Separated Values
60
+ **Description:** Tab-delimited tabular data
61
+ **Typical Data:** Similar to CSV but tab-separated
62
+ **Use Cases:** Bioinformatics, text processing output
63
+ **Python Libraries:**
64
+ - `pandas`: `pd.read_csv('file.tsv', sep='\t')`
65
+ **EDA Approach:**
66
+ - Same as CSV format
67
+ - Tab vs space validation
68
+ - Quote handling
69
+
70
+ ### .xlsx / .xls - Excel Spreadsheets
71
+ **Description:** Microsoft Excel binary/XML formats
72
+ **Typical Data:** Tabular data with formatting, formulas
73
+ **Use Cases:** Lab notebooks, data entry, reports
74
+ **Python Libraries:**
75
+ - `pandas`: `pd.read_excel('file.xlsx')`
76
+ - `openpyxl`: Full Excel file manipulation
77
+ - `xlrd`: Reading .xls (legacy)
78
+ **EDA Approach:**
79
+ - Sheet enumeration and names
80
+ - Per-sheet data analysis
81
+ - Formula evaluation
82
+ - Merged cells handling
83
+ - Hidden rows/columns
84
+ - Data validation rules
85
+ - Named ranges
86
+ - Formatting-only cells detection
87
+
88
+ ### .json - JavaScript Object Notation
89
+ **Description:** Hierarchical text data format
90
+ **Typical Data:** Nested data structures, metadata
91
+ **Use Cases:** API responses, configuration, results
92
+ **Python Libraries:**
93
+ - `json`: Built-in module
94
+ - `pandas`: `pd.read_json()`
95
+ - `ujson`: Faster JSON parsing
96
+ **EDA Approach:**
97
+ - Schema inference
98
+ - Nesting depth
99
+ - Key-value distribution
100
+ - Array lengths
101
+ - Data type consistency
102
+ - Missing keys
103
+ - Duplicate detection
104
+ - Size and complexity metrics
105
+
106
+ ### .xml - Extensible Markup Language
107
+ **Description:** Hierarchical markup format
108
+ **Typical Data:** Structured data with metadata
109
+ **Use Cases:** Standards-based data exchange, APIs
110
+ **Python Libraries:**
111
+ - `lxml`: `lxml.etree.parse()`
112
+ - `xml.etree.ElementTree`: Built-in XML
113
+ - `xmltodict`: Convert XML to dict
114
+ **EDA Approach:**
115
+ - Schema/DTD validation
116
+ - Element hierarchy and depth
117
+ - Namespace handling
118
+ - Attribute vs element content
119
+ - CDATA sections
120
+ - Text content extraction
121
+ - Sibling and child counts
122
+
123
+ ### .yaml / .yml - YAML
124
+ **Description:** Human-readable data serialization
125
+ **Typical Data:** Configuration, metadata, parameters
126
+ **Use Cases:** Experiment configurations, pipelines
127
+ **Python Libraries:**
128
+ - `yaml` (PyYAML): `yaml.safe_load()` (preferred); use `yaml.load(f, Loader=...)` only on trusted input
129
+ - `ruamel.yaml`: YAML 1.2 support
130
+ **EDA Approach:**
131
+ - Configuration structure
132
+ - Data type handling
133
+ - List and dict depth
134
+ - Anchor and alias usage
135
+ - Multi-document files
136
+ - Comments preservation
137
+ - Validation against schema
138
+
139
+ ### .toml - TOML Configuration
140
+ **Description:** Configuration file format
141
+ **Typical Data:** Settings, parameters
142
+ **Use Cases:** Python package configuration, settings
143
+ **Python Libraries:**
144
+ - `tomli` / `tomllib`: TOML reading (tomllib in Python 3.11+)
145
+ - `toml`: Reading and writing
146
+ **EDA Approach:**
147
+ - Section structure
148
+ - Key-value pairs
149
+ - Data type inference
150
+ - Nested table validation
151
+ - Required vs optional fields
152
+
153
+ ### .ini - INI Configuration
154
+ **Description:** Simple configuration format
155
+ **Typical Data:** Application settings
156
+ **Use Cases:** Legacy configurations, simple settings
157
+ **Python Libraries:**
158
+ - `configparser`: Built-in INI parser
159
+ **EDA Approach:**
160
+ - Section enumeration
161
+ - Key-value extraction
162
+ - Type conversion
163
+ - Comment handling
164
+ - Case sensitivity
165
+
166
+ ## Binary and Compressed Data
167
+
168
+ ### .hdf5 / .h5 - Hierarchical Data Format 5
169
+ **Description:** Container for large scientific datasets
170
+ **Typical Data:** Multi-dimensional arrays, metadata, groups
171
+ **Use Cases:** Large datasets, multi-modal data, parallel I/O
172
+ **Python Libraries:**
173
+ - `h5py`: `h5py.File('file.h5', 'r')`
174
+ - `tables` (PyTables): Advanced HDF5 interface
175
+ - `pandas`: HDF5 storage via HDFStore
176
+ **EDA Approach:**
177
+ - Group and dataset hierarchy
178
+ - Dataset shapes and dtypes
179
+ - Attributes and metadata
180
+ - Compression and chunking strategy
181
+ - Memory-efficient sampling
182
+ - Dataset relationships
183
+ - File size and efficiency
184
+ - Access patterns optimization
185
+
186
+ ### .zarr - Chunked Array Storage
187
+ **Description:** Cloud-optimized chunked arrays
188
+ **Typical Data:** Large N-dimensional arrays
189
+ **Use Cases:** Cloud storage, parallel computing, streaming
190
+ **Python Libraries:**
191
+ - `zarr`: `zarr.open('file.zarr')`
192
+ - `xarray`: Zarr backend support
193
+ **EDA Approach:**
194
+ - Array metadata and dimensions
195
+ - Chunk size optimization
196
+ - Compression codec and ratio
197
+ - Synchronizer and store type
198
+ - Multi-scale hierarchies
199
+ - Parallel access performance
200
+ - Attribute metadata
201
+
202
+ ### .gz / .gzip - Gzip Compressed
203
+ **Description:** Compressed data files
204
+ **Typical Data:** Any compressed text or binary
205
+ **Use Cases:** Compression for storage/transfer
206
+ **Python Libraries:**
207
+ - `gzip`: Built-in gzip module
208
+ - `pandas`: Automatic gzip handling in read functions
209
+ **EDA Approach:**
210
+ - Compression ratio
211
+ - Original file type detection
212
+ - Decompression validation
213
+ - Header information
214
+ - Multi-member archives
215
+
216
+ ### .bz2 - Bzip2 Compressed
217
+ **Description:** Bzip2 compression
218
+ **Typical Data:** Highly compressed files
219
+ **Use Cases:** Better compression than gzip
220
+ **Python Libraries:**
221
+ - `bz2`: Built-in bz2 module
222
+ - Automatic handling in pandas
223
+ **EDA Approach:**
224
+ - Compression efficiency
225
+ - Decompression time
226
+ - Content validation
227
+
228
+ ### .zip - ZIP Archive
229
+ **Description:** Archive with multiple files
230
+ **Typical Data:** Collections of files
231
+ **Use Cases:** File distribution, archiving
232
+ **Python Libraries:**
233
+ - `zipfile`: Built-in ZIP support
234
+ - `pandas`: Can read zipped CSVs
235
+ **EDA Approach:**
236
+ - Archive member listing
237
+ - Compression method per file
238
+ - Total vs compressed size
239
+ - Directory structure
240
+ - File type distribution
241
+ - Extraction validation
242
+
243
+ ### .tar / .tar.gz - TAR Archive
244
+ **Description:** Unix tape archive
245
+ **Typical Data:** Multiple files and directories
246
+ **Use Cases:** Software distribution, backups
247
+ **Python Libraries:**
248
+ - `tarfile`: Built-in TAR support
249
+ **EDA Approach:**
250
+ - Member file listing
251
+ - Compression (if .tar.gz, .tar.bz2)
252
+ - Directory structure
253
+ - Permissions preservation
254
+ - Extraction testing
255
+
256
+ ## Time Series and Waveform Data
257
+
258
+ ### .wav - Waveform Audio
259
+ **Description:** Audio waveform data
260
+ **Typical Data:** Acoustic signals, audio recordings
261
+ **Use Cases:** Acoustic analysis, ultrasound, signal processing
262
+ **Python Libraries:**
263
+ - `scipy.io.wavfile`: `scipy.io.wavfile.read()`
264
+ - `wave`: Built-in module
265
+ - `soundfile`: Enhanced audio I/O
266
+ **EDA Approach:**
267
+ - Sample rate and duration
268
+ - Bit depth and channels
269
+ - Amplitude distribution
270
+ - Spectral analysis (FFT)
271
+ - Signal-to-noise ratio
272
+ - Clipping detection
273
+ - Frequency content
274
+
275
+ ### .mat - MATLAB Data
276
+ **Description:** MATLAB workspace variables
277
+ **Typical Data:** Arrays, structures, cells
278
+ **Use Cases:** MATLAB-Python interoperability
279
+ **Python Libraries:**
280
+ - `scipy.io`: `scipy.io.loadmat()`
281
+ - `h5py`: For MATLAB v7.3 files (HDF5-based)
282
+ - `mat73`: Loads v7.3 files into native Python types (built on `h5py`)
283
+ **EDA Approach:**
284
+ - Variable names and types
285
+ - Array dimensions
286
+ - Structure field exploration
287
+ - Cell array handling
288
+ - Sparse matrix detection
289
+ - MATLAB version compatibility
290
+ - Metadata extraction
291
+
292
+ ### .edf - European Data Format
293
+ **Description:** Time series data (especially medical)
294
+ **Typical Data:** EEG, physiological signals
295
+ **Use Cases:** Medical signal storage
296
+ **Python Libraries:**
297
+ - `pyedflib`: EDF/EDF+ reading and writing
298
+ - `mne`: Neurophysiology data (supports EDF)
299
+ **EDA Approach:**
300
+ - Signal count and names
301
+ - Sampling frequencies
302
+ - Signal ranges and units
303
+ - Recording duration
304
+ - Annotation events
305
+ - Data quality (saturation, noise)
306
+ - Patient/study information
307
+
308
+ ### .csv (Time Series)
309
+ **Description:** CSV with timestamp column
310
+ **Typical Data:** Time-indexed measurements
311
+ **Use Cases:** Sensor data, monitoring, experiments
312
+ **Python Libraries:**
313
+ - `pandas`: `pd.read_csv()` with `parse_dates`
314
+ **EDA Approach:**
315
+ - Temporal range and resolution
316
+ - Sampling regularity
317
+ - Missing time points
318
+ - Trend and seasonality
319
+ - Stationarity tests
320
+ - Autocorrelation
321
+ - Anomaly detection
322
+
323
+ ## Geospatial and Environmental Data
324
+
325
+ ### .shp - Shapefile
326
+ **Description:** Geospatial vector data
327
+ **Typical Data:** Geographic features (points, lines, polygons)
328
+ **Use Cases:** GIS analysis, spatial data
329
+ **Python Libraries:**
330
+ - `geopandas`: `gpd.read_file('file.shp')`
331
+ - `fiona`: Lower-level shapefile access
332
+ - `pyshp`: Pure Python shapefile reader
333
+ **EDA Approach:**
334
+ - Geometry type and count
335
+ - Coordinate reference system
336
+ - Bounding box
337
+ - Attribute table analysis
338
+ - Geometry validity
339
+ - Spatial distribution
340
+ - Multi-part features
341
+ - Associated files (.shx, .dbf, .prj)
342
+
343
+ ### .geojson - GeoJSON
344
+ **Description:** JSON format for geographic data
345
+ **Typical Data:** Features with geometry and properties
346
+ **Use Cases:** Web mapping, spatial analysis
347
+ **Python Libraries:**
348
+ - `geopandas`: Native GeoJSON support
349
+ - `json`: Parse as JSON then process
350
+ **EDA Approach:**
351
+ - Feature count and types
352
+ - CRS specification
353
+ - Bounding box calculation
354
+ - Property schema
355
+ - Geometry complexity
356
+ - Nesting structure
357
+
358
+ ### .tif / .tiff (Geospatial)
359
+ **Description:** GeoTIFF with spatial reference
360
+ **Typical Data:** Satellite imagery, DEMs, rasters
361
+ **Use Cases:** Remote sensing, terrain analysis
362
+ **Python Libraries:**
363
+ - `rasterio`: `rasterio.open('file.tif')`
364
+ - `gdal`: Geospatial Data Abstraction Library
365
+ - `xarray` with `rioxarray`: N-D geospatial arrays
366
+ **EDA Approach:**
367
+ - Raster dimensions and resolution
368
+ - Band count and descriptions
369
+ - Coordinate reference system
370
+ - Geotransform parameters
371
+ - NoData value handling
372
+ - Pixel value distribution
373
+ - Histogram analysis
374
+ - Overviews and pyramids
375
+
376
+ ### .nc / .netcdf - Network Common Data Form
377
+ **Description:** Self-describing array-based data
378
+ **Typical Data:** Climate, atmospheric, oceanographic data
379
+ **Use Cases:** Scientific datasets, model output
380
+ **Python Libraries:**
381
+ - `netCDF4`: `netCDF4.Dataset('file.nc')`
382
+ - `xarray`: `xr.open_dataset('file.nc')`
383
+ **EDA Approach:**
384
+ - Variable enumeration
385
+ - Dimension analysis
386
+ - Time series properties
387
+ - Spatial coverage
388
+ - Attribute metadata (CF conventions)
389
+ - Coordinate systems
390
+ - Chunking and compression
391
+ - Data quality flags
392
+
393
+ ### .grib / .grib2 - Gridded Binary
394
+ **Description:** Meteorological data format
395
+ **Typical Data:** Weather forecasts, climate data
396
+ **Use Cases:** Numerical weather prediction
397
+ **Python Libraries:**
398
+ - `pygrib`: GRIB file reading
399
+ - `xarray` with `cfgrib`: GRIB to xarray
400
+ **EDA Approach:**
401
+ - Message inventory
402
+ - Parameter and level types
403
+ - Spatial grid specification
404
+ - Temporal coverage
405
+ - Ensemble members
406
+ - Forecast vs analysis
407
+ - Data packing and precision
408
+
409
+ ### .hdf4 - HDF4 Format
410
+ **Description:** Older HDF format
411
+ **Typical Data:** NASA Earth Science data
412
+ **Use Cases:** Satellite data (MODIS, etc.)
413
+ **Python Libraries:**
414
+ - `pyhdf`: HDF4 access
415
+ - `gdal`: Can read HDF4
416
+ **EDA Approach:**
417
+ - Scientific dataset listing
418
+ - Vdata and attributes
419
+ - Dimension scales
420
+ - Metadata extraction
421
+ - Quality flags
422
+ - Conversion to HDF5 or NetCDF
423
+
424
+ ## Specialized Scientific Formats
425
+
426
+ ### .fits - Flexible Image Transport System
427
+ **Description:** Astronomy data format
428
+ **Typical Data:** Images, tables, spectra from telescopes
429
+ **Use Cases:** Astronomical observations
430
+ **Python Libraries:**
431
+ - `astropy.io.fits`: `fits.open('file.fits')`
432
+ - `fitsio`: Alternative FITS library
433
+ **EDA Approach:**
434
+ - HDU (Header Data Unit) structure
435
+ - Image dimensions and WCS
436
+ - Header keyword analysis
437
+ - Table column descriptions
438
+ - Data type and scaling
439
+ - FITS convention compliance
440
+ - Checksum validation
441
+
442
+ ### .asdf - Advanced Scientific Data Format
443
+ **Description:** Next-gen data format for astronomy
444
+ **Typical Data:** Complex hierarchical scientific data
445
+ **Use Cases:** James Webb Space Telescope data
446
+ **Python Libraries:**
447
+ - `asdf`: `asdf.open('file.asdf')`
448
+ **EDA Approach:**
449
+ - Tree structure exploration
450
+ - Schema validation
451
+ - Internal vs external arrays
452
+ - Compression methods
453
+ - YAML metadata
454
+ - Version compatibility
455
+
456
+ ### .root - ROOT Data Format
457
+ **Description:** CERN ROOT framework format
458
+ **Typical Data:** High-energy physics data
459
+ **Use Cases:** Particle physics experiments
460
+ **Python Libraries:**
461
+ - `uproot`: Pure Python ROOT reading
462
+ - `ROOT`: Official PyROOT bindings
463
+ **EDA Approach:**
464
+ - TTree structure
465
+ - Branch types and entries
466
+ - Histogram inventory
467
+ - Event loop statistics
468
+ - File compression
469
+ - Split level analysis
470
+
471
+ ### .txt - Plain Text Data
472
+ **Description:** Generic text-based data
473
+ **Typical Data:** Tab/space-delimited, custom formats
474
+ **Use Cases:** Simple data exchange, logs
475
+ **Python Libraries:**
476
+ - `pandas`: `pd.read_csv()` with custom delimiters
477
+ - `numpy`: `np.loadtxt()`, `np.genfromtxt()`
478
+ - Built-in file reading
479
+ **EDA Approach:**
480
+ - Format detection (delimiter, header)
481
+ - Data type inference
482
+ - Comment line handling
483
+ - Missing value codes
484
+ - Column alignment
485
+ - Encoding detection
486
+
487
+ ### .dat - Generic Data File
488
+ **Description:** Binary or text data
489
+ **Typical Data:** Instrument output, custom formats
490
+ **Use Cases:** Various scientific instruments
491
+ **Python Libraries:**
492
+ - Format-specific: requires knowledge of structure
493
+ - `numpy`: `np.fromfile()` for binary
494
+ - `struct`: Parse binary structures
495
+ **EDA Approach:**
496
+ - Binary vs text determination
497
+ - Header detection
498
+ - Record structure inference
499
+ - Endianness
500
+ - Data type patterns
501
+ - Validation with documentation
502
+
503
+ ### .log - Log Files
504
+ **Description:** Text logs from software/instruments
505
+ **Typical Data:** Timestamped events, messages
506
+ **Use Cases:** Troubleshooting, experiment tracking
507
+ **Python Libraries:**
508
+ - Built-in file reading
509
+ - `pandas`: Structured log parsing
510
+ - Regular expressions for parsing
511
+ **EDA Approach:**
512
+ - Log level distribution
513
+ - Timestamp parsing
514
+ - Error and warning frequency
515
+ - Event sequencing
516
+ - Pattern recognition
517
+ - Anomaly detection
518
+ - Session boundaries
.scider/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Microscopy and Imaging File Formats Reference
2
+
3
+ This reference covers file formats used in microscopy, medical imaging, remote sensing, and scientific image analysis.
4
+
5
+ ## Microscopy-Specific Formats
6
+
7
+ ### .tif / .tiff - Tagged Image File Format
8
+ **Description:** Flexible image format supporting multiple pages and metadata
9
+ **Typical Data:** Microscopy images, z-stacks, time series, multi-channel
10
+ **Use Cases:** Fluorescence microscopy, confocal imaging, biological imaging
11
+ **Python Libraries:**
12
+ - `tifffile`: `tifffile.imread('file.tif')` - Microscopy TIFF support
13
+ - `PIL/Pillow`: `Image.open('file.tif')` - Basic TIFF
14
+ - `scikit-image`: `io.imread('file.tif')`
15
+ - `AICSImageIO`: Multi-format microscopy reader
16
+ **EDA Approach:**
17
+ - Image dimensions and bit depth
18
+ - Multi-page/z-stack analysis
19
+ - Metadata extraction (OME-TIFF)
20
+ - Channel analysis and intensity distributions
21
+ - Temporal dynamics (time-lapse)
22
+ - Pixel size and spatial calibration
23
+ - Histogram analysis per channel
24
+ - Dynamic range utilization
25
+
26
+ ### .nd2 - Nikon NIS-Elements
27
+ **Description:** Proprietary Nikon microscope format
28
+ **Typical Data:** Multi-dimensional microscopy (XYZCT)
29
+ **Use Cases:** Nikon microscope data, confocal, widefield
30
+ **Python Libraries:**
31
+ - `nd2reader`: `ND2Reader('file.nd2')`
32
+ - `pims`: `pims.ND2_Reader('file.nd2')`
33
+ - `AICSImageIO`: Universal reader
34
+ **EDA Approach:**
35
+ - Experiment metadata extraction
36
+ - Channel configurations
37
+ - Time-lapse frame analysis
38
+ - Z-stack depth and spacing
39
+ - XY stage positions
40
+ - Laser settings and power
41
+ - Pixel binning information
42
+ - Acquisition timestamps
43
+
44
+ ### .lif - Leica Image Format
45
+ **Description:** Leica microscope proprietary format
46
+ **Typical Data:** Multi-experiment, multi-dimensional images
47
+ **Use Cases:** Leica confocal and widefield data
48
+ **Python Libraries:**
49
+ - `readlif`: `readlif.reader.LifFile('file.lif')`
50
+ - `AICSImageIO`: LIF support
51
+ - `python-bioformats`: Via Bio-Formats
52
+ **EDA Approach:**
53
+ - Multiple experiment detection
54
+ - Image series enumeration
55
+ - Metadata per experiment
56
+ - Channel and timepoint structure
57
+ - Physical dimensions extraction
58
+ - Objective and detector information
59
+ - Scan settings analysis
60
+
61
+ ### .czi - Carl Zeiss Image
62
+ **Description:** Zeiss microscope format
63
+ **Typical Data:** Multi-dimensional microscopy with rich metadata
64
+ **Use Cases:** Zeiss confocal, lightsheet, widefield
65
+ **Python Libraries:**
66
+ - `czifile`: `czifile.CziFile('file.czi')`
67
+ - `AICSImageIO`: CZI support
68
+ - `pylibCZIrw`: Official Zeiss library
69
+ **EDA Approach:**
70
+ - Scene and position analysis
71
+ - Mosaic tile structure
72
+ - Channel wavelength information
73
+ - Acquisition mode detection
74
+ - Scaling and calibration
75
+ - Instrument configuration
76
+ - ROI definitions
77
+
78
+ ### .oib / .oif - Olympus Image Format
79
+ **Description:** Olympus microscope formats
80
+ **Typical Data:** Confocal and multiphoton imaging
81
+ **Use Cases:** Olympus FluoView data
82
+ **Python Libraries:**
83
+ - `AICSImageIO`: OIB/OIF support
84
+ - `python-bioformats`: Via Bio-Formats
85
+ **EDA Approach:**
86
+ - Directory structure validation (OIF)
87
+ - Metadata file parsing
88
+ - Channel configuration
89
+ - Scan parameters
90
+ - Objective and filter information
91
+ - PMT settings
92
+
93
+ ### .vsi - Olympus VSI
94
+ **Description:** Olympus slide scanner format
95
+ **Typical Data:** Whole slide imaging, large mosaics
96
+ **Use Cases:** Virtual microscopy, pathology
97
+ **Python Libraries:**
98
+ - `python-bioformats`: Via Bio-Formats (cellSens VSI reader; note that OpenSlide does not read Olympus `.vsi`)
99
+ - `AICSImageIO`: VSI support
100
+ **EDA Approach:**
101
+ - Pyramid level analysis
102
+ - Tile structure and overlap
103
+ - Macro and label images
104
+ - Magnification levels
105
+ - Whole slide statistics
106
+ - Region detection
107
+
108
+ ### .ims - Imaris Format
109
+ **Description:** Bitplane Imaris HDF5-based format
110
+ **Typical Data:** Large 3D/4D microscopy datasets
111
+ **Use Cases:** 3D rendering, time-lapse analysis
112
+ **Python Libraries:**
113
+ - `h5py`: Direct HDF5 access
114
+ - `imaris_ims_file_reader`: Specialized reader
115
+ **EDA Approach:**
116
+ - Resolution level analysis
117
+ - Time point structure
118
+ - Channel organization
119
+ - Dataset hierarchy
120
+ - Thumbnail generation
121
+ - Memory-mapped access strategies
122
+ - Chunking optimization
123
+
124
+ ### .lsm - Zeiss LSM
125
+ **Description:** Legacy Zeiss confocal format
126
+ **Typical Data:** Confocal laser scanning microscopy
127
+ **Use Cases:** Older Zeiss confocal data
128
+ **Python Libraries:**
129
+ - `tifffile`: LSM support (TIFF-based)
130
+ - `python-bioformats`: LSM reading
131
+ **EDA Approach:**
132
+ - Similar to TIFF with LSM-specific metadata
133
+ - Scan speed and resolution
134
+ - Laser lines and power
135
+ - Detector gain and offset
136
+ - LUT information
137
+
138
+ ### .stk - MetaMorph Stack
139
+ **Description:** MetaMorph image stack format
140
+ **Typical Data:** Time-lapse or z-stack sequences
141
+ **Use Cases:** MetaMorph software output
142
+ **Python Libraries:**
143
+ - `tifffile`: STK is TIFF-based
144
+ - `python-bioformats`: STK support
145
+ **EDA Approach:**
146
+ - Stack dimensionality
147
+ - Plane metadata
148
+ - Timing information
149
+ - Stage positions
150
+ - UIC tags parsing
151
+
152
+ ### .dv - DeltaVision
153
+ **Description:** Applied Precision DeltaVision format
154
+ **Typical Data:** Deconvolution microscopy
155
+ **Use Cases:** DeltaVision microscope data
156
+ **Python Libraries:**
157
+ - `mrc`: Can read DV (MRC-related)
158
+ - `AICSImageIO`: DV support
159
+ **EDA Approach:**
160
+ - Wave information (channels)
161
+ - Extended header analysis
162
+ - Lens and magnification
163
+ - Deconvolution status
164
+ - Time stamps per section
165
+
166
+ ### .mrc - Medical Research Council
167
+ **Description:** Electron microscopy format
168
+ **Typical Data:** EM images, cryo-EM, tomography
169
+ **Use Cases:** Structural biology, electron microscopy
170
+ **Python Libraries:**
171
+ - `mrcfile`: `mrcfile.open('file.mrc')`
172
+ - `EMAN2`: EM-specific tools
173
+ **EDA Approach:**
174
+ - Volume dimensions
175
+ - Voxel size and units
176
+ - Origin and map statistics
177
+ - Symmetry information
178
+ - Extended header analysis
179
+ - Density statistics
180
+ - Header consistency validation
181
+
182
+ ### .dm3 / .dm4 - Gatan Digital Micrograph
183
+ **Description:** Gatan TEM/STEM format
184
+ **Typical Data:** Transmission electron microscopy
185
+ **Use Cases:** TEM imaging and analysis
186
+ **Python Libraries:**
187
+ - `hyperspy`: `hs.load('file.dm3')`
188
+ - `ncempy`: `ncempy.io.dm.dmReader('file.dm3')`
189
+ **EDA Approach:**
190
+ - Microscope parameters
191
+ - Energy dispersive spectroscopy data
192
+ - Diffraction patterns
193
+ - Calibration information
194
+ - Tag structure analysis
195
+ - Image series handling
196
+
197
+ ### .eer - Electron Event Representation
198
+ **Description:** Direct electron detector format
199
+ **Typical Data:** Electron counting data from detectors
200
+ **Use Cases:** Cryo-EM data collection
201
+ **Python Libraries:**
202
+ - `mrcfile`: Some EER support
203
+ - Vendor-specific tools (Gatan, TFS)
204
+ **EDA Approach:**
205
+ - Event counting statistics
206
+ - Frame rate and dose
207
+ - Detector configuration
208
+ - Motion correction assessment
209
+ - Gain reference validation
210
+
211
+ ### .ser - TIA Series
212
+ **Description:** FEI/TFS TIA format
213
+ **Typical Data:** EM image series
214
+ **Use Cases:** FEI/Thermo Fisher EM data
215
+ **Python Libraries:**
216
+ - `hyperspy`: SER support
217
+ - `ncempy`: TIA reader
218
+ **EDA Approach:**
219
+ - Series structure
220
+ - Calibration data
221
+ - Acquisition metadata
222
+ - Time stamps
223
+ - Multi-dimensional data organization
224
+
225
+ ## Medical and Biological Imaging
226
+
227
+ ### .dcm - DICOM
228
+ **Description:** Digital Imaging and Communications in Medicine
229
+ **Typical Data:** Medical images with patient/study metadata
230
+ **Use Cases:** Clinical imaging, radiology, CT, MRI, PET
231
+ **Python Libraries:**
232
+ - `pydicom`: `pydicom.dcmread('file.dcm')`
233
+ - `SimpleITK`: `sitk.ReadImage('file.dcm')`
234
+ - `nibabel`: Limited DICOM support
235
+ **EDA Approach:**
236
+ - Patient metadata extraction (anonymization check)
237
+ - Modality-specific analysis
238
+ - Series and study organization
239
+ - Slice thickness and spacing
240
+ - Window/level settings
241
+ - Hounsfield units (CT)
242
+ - Image orientation and position
243
+ - Multi-frame analysis
244
+
245
+ ### .nii / .nii.gz - NIfTI
246
+ **Description:** Neuroimaging Informatics Technology Initiative
247
+ **Typical Data:** Brain imaging, fMRI, structural MRI
248
+ **Use Cases:** Neuroimaging research, brain analysis
249
+ **Python Libraries:**
250
+ - `nibabel`: `nibabel.load('file.nii')`
251
+ - `nilearn`: Neuroimaging with ML
252
+ - `SimpleITK`: NIfTI support
253
+ **EDA Approach:**
254
+ - Volume dimensions and voxel size
255
+ - Affine transformation matrix
256
+ - Time series analysis (fMRI)
257
+ - Intensity distribution
258
+ - Brain extraction quality
259
+ - Registration assessment
260
+ - Orientation validation
261
+ - Header information consistency
262
+
263
+ ### .mnc - MINC Format
264
+ **Description:** Medical Imaging NetCDF (MINC)
265
+ **Typical Data:** Medical imaging (predecessor to NIfTI)
266
+ **Use Cases:** Legacy neuroimaging data
267
+ **Python Libraries:**
268
+ - `pyminc`: MINC-specific tools
269
+ - `nibabel`: MINC support
270
+ **EDA Approach:**
271
+ - Similar to NIfTI
272
+ - NetCDF structure exploration
273
+ - Dimension ordering
274
+ - Metadata extraction
275
+
276
+ ### .nrrd - Nearly Raw Raster Data
277
+ **Description:** Medical imaging format with an attached (`.nrrd`) or detached (`.nhdr`) header
278
+ **Typical Data:** Medical images, research imaging
279
+ **Use Cases:** 3D Slicer, ITK-based applications
280
+ **Python Libraries:**
281
+ - `pynrrd`: `nrrd.read('file.nrrd')`
282
+ - `SimpleITK`: NRRD support
283
+ **EDA Approach:**
284
+ - Header field analysis
285
+ - Encoding format
286
+ - Dimension and spacing
287
+ - Orientation matrix
288
+ - Compression assessment
289
+ - Endianness handling
290
+
291
+ ### .mha / .mhd - MetaImage
292
+ **Description:** MetaImage format (ITK)
293
+ **Typical Data:** Medical/scientific 3D images
294
+ **Use Cases:** ITK/SimpleITK applications
295
+ **Python Libraries:**
296
+ - `SimpleITK`: Native MHA/MHD support
297
+ - `itk`: Direct ITK integration
298
+ **EDA Approach:**
299
+ - Header-data file pairing (MHD)
300
+ - Transform matrix
301
+ - Element spacing
302
+ - Compression format
303
+ - Data type and dimensions
304
+
305
+ ### .hdr / .img - Analyze Format
306
+ **Description:** Legacy medical imaging format
307
+ **Typical Data:** Brain imaging (pre-NIfTI)
308
+ **Use Cases:** Old neuroimaging datasets
309
+ **Python Libraries:**
310
+ - `nibabel`: Analyze support
311
+ - Conversion to NIfTI recommended
312
+ **EDA Approach:**
313
+ - Header-image pairing validation
314
+ - Byte order issues
315
+ - Conversion to modern formats
316
+ - Metadata limitations
317
+
318
+ ## Scientific Image Formats
319
+
320
+ ### .png - Portable Network Graphics
321
+ **Description:** Lossless compressed image format
322
+ **Typical Data:** 2D images, screenshots, processed data
323
+ **Use Cases:** Publication figures, lossless storage
324
+ **Python Libraries:**
325
+ - `PIL/Pillow`: `Image.open('file.png')`
326
+ - `scikit-image`: `io.imread('file.png')`
327
+ - `imageio`: `imageio.imread('file.png')`
328
+ **EDA Approach:**
329
+ - Bit depth analysis (8-bit, 16-bit)
330
+ - Color mode (grayscale, RGB, palette)
331
+ - Metadata (PNG chunks)
332
+ - Transparency handling
333
+ - Compression efficiency
334
+ - Histogram analysis
335
+
336
+ ### .jpg / .jpeg - Joint Photographic Experts Group
337
+ **Description:** Lossy compressed image format
338
+ **Typical Data:** Natural images, photos
339
+ **Use Cases:** Visualization, web graphics (not raw data)
340
+ **Python Libraries:**
341
+ - `PIL/Pillow`: Standard JPEG support
342
+ - `scikit-image`: JPEG reading
343
+ **EDA Approach:**
344
+ - Compression artifacts detection
345
+ - Quality factor estimation
346
+ - Color space (RGB, grayscale)
347
+ - EXIF metadata
348
+ - Quantization table analysis
349
+ - Note: Not suitable for quantitative analysis
350
+
351
+ ### .bmp - Bitmap Image
352
+ **Description:** Uncompressed raster image
353
+ **Typical Data:** Simple images, screenshots
354
+ **Use Cases:** Compatibility, simple storage
355
+ **Python Libraries:**
356
+ - `PIL/Pillow`: BMP support
357
+ - `scikit-image`: BMP reading
358
+ **EDA Approach:**
359
+ - Color depth
360
+ - Palette analysis (if indexed)
361
+ - File size efficiency
362
+ - Pixel format validation
363
+
364
+ ### .gif - Graphics Interchange Format
365
+ **Description:** Image format with animation support
366
+ **Typical Data:** Animated images, simple graphics
367
+ **Use Cases:** Animations, time-lapse visualization
368
+ **Python Libraries:**
369
+ - `PIL/Pillow`: GIF support
370
+ - `imageio`: Better GIF animation support
371
+ **EDA Approach:**
372
+ - Frame count and timing
373
+ - Palette limitations (256 colors)
374
+ - Loop count
375
+ - Disposal method
376
+ - Transparency handling
377
+
378
+ ### .svg - Scalable Vector Graphics
379
+ **Description:** XML-based vector graphics
380
+ **Typical Data:** Vector drawings, plots, diagrams
381
+ **Use Cases:** Publication-quality figures, plots
382
+ **Python Libraries:**
383
+ - `svgpathtools`: Path manipulation
384
+ - `cairosvg`: Rasterization
385
+ - `lxml`: XML parsing
386
+ **EDA Approach:**
387
+ - Element structure analysis
388
+ - Style information
389
+ - Viewbox and dimensions
390
+ - Path complexity
391
+ - Text element extraction
392
+ - Layer organization
393
+
394
+ ### .eps - Encapsulated PostScript
395
+ **Description:** Vector graphics format
396
+ **Typical Data:** Publication figures
397
+ **Use Cases:** Legacy publication graphics
398
+ **Python Libraries:**
399
+ - `PIL/Pillow`: Basic EPS rasterization
400
+ - `ghostscript` via subprocess
401
+ **EDA Approach:**
402
+ - Bounding box information
403
+ - Preview image validation
404
+ - Font embedding
405
+ - Conversion to modern formats
406
+
407
+ ### .pdf (Images)
408
+ **Description:** Portable Document Format with images
409
+ **Typical Data:** Publication figures, multi-page documents
410
+ **Use Cases:** Publication, data presentation
411
+ **Python Libraries:**
412
+ - `PyMuPDF/fitz`: `fitz.open('file.pdf')`
413
+ - `pdf2image`: Rasterization
414
+ - `pdfplumber`: Text and layout extraction
415
+ **EDA Approach:**
416
+ - Page count
417
+ - Image extraction
418
+ - Resolution and DPI
419
+ - Embedded fonts and metadata
420
+ - Compression methods
421
+ - Image vs vector content
422
+
423
+ ### .fig - MATLAB Figure
424
+ **Description:** MATLAB figure file
425
+ **Typical Data:** MATLAB plots and figures
426
+ **Use Cases:** MATLAB data visualization
427
+ **Python Libraries:**
428
+ - Custom parsers (MAT file structure)
429
+ - Conversion to other formats
430
+ **EDA Approach:**
431
+ - Figure structure
432
+ - Data extraction from plots
433
+ - Axes and label information
434
+ - Plot type identification
435
+
436
+ ### .hdf5 (Imaging Specific)
437
+ **Description:** HDF5 for large imaging datasets
438
+ **Typical Data:** High-content screening, large microscopy
439
+ **Use Cases:** BigDataViewer, large-scale imaging
440
+ **Python Libraries:**
441
+ - `h5py`: Universal HDF5 access
442
+ - Imaging-specific readers (BigDataViewer)
443
+ **EDA Approach:**
444
+ - Dataset hierarchy
445
+ - Chunk and compression strategy
446
+ - Multi-resolution pyramid
447
+ - Metadata organization
448
+ - Memory-mapped access
449
+ - Parallel I/O performance
450
+
451
+ ### .zarr - Chunked Array Storage
452
+ **Description:** Cloud-optimized array storage
453
+ **Typical Data:** Large imaging datasets, OME-ZARR
454
+ **Use Cases:** Cloud microscopy, large-scale analysis
455
+ **Python Libraries:**
456
+ - `zarr`: `zarr.open('file.zarr')`
457
+ - `ome-zarr-py`: OME-ZARR support
458
+ **EDA Approach:**
459
+ - Chunk size optimization
460
+ - Compression codec analysis
461
+ - Multi-scale representation
462
+ - Array dimensions and dtype
463
+ - Metadata structure (OME)
464
+ - Cloud access patterns
465
+
466
+ ### .raw - Raw Image Data
467
+ **Description:** Unformatted binary pixel data
468
+ **Typical Data:** Raw detector output
469
+ **Use Cases:** Custom imaging systems
470
+ **Python Libraries:**
471
+ - `numpy`: `np.fromfile()` with dtype
472
+ - `imageio`: Raw format plugins
473
+ **EDA Approach:**
474
+ - Dimensions determination (external info needed)
475
+ - Byte order and data type
476
+ - Header presence detection
477
+ - Pixel value range
478
+ - Noise characteristics
479
+
480
+ ### .bin - Binary Image Data
481
+ **Description:** Generic binary image format
482
+ **Typical Data:** Raw or custom-formatted images
483
+ **Use Cases:** Instrument-specific outputs
484
+ **Python Libraries:**
485
+ - `numpy`: Custom binary reading
486
+ - `struct`: For structured binary data
487
+ **EDA Approach:**
488
+ - Format specification required
489
+ - Header parsing (if present)
490
+ - Data type inference
491
+ - Dimension extraction
492
+ - Validation with known parameters
493
+
494
+ ## Image Analysis Formats
495
+
496
+ ### .roi - ImageJ ROI
497
+ **Description:** ImageJ region of interest format
498
+ **Typical Data:** Geometric ROIs, selections
499
+ **Use Cases:** ImageJ/Fiji analysis workflows
500
+ **Python Libraries:**
501
+ - `read-roi`: `read_roi.read_roi_file('file.roi')`
502
+ - `roifile`: ROI manipulation
503
+ **EDA Approach:**
504
+ - ROI type analysis (rectangle, polygon, etc.)
505
+ - Coordinate extraction
506
+ - ROI properties (area, perimeter)
507
+ - Group analysis (ROI sets)
508
+ - Z-position and time information
509
+
510
+ ### .zip (ROI sets)
511
+ **Description:** ZIP archive of ImageJ ROIs
512
+ **Typical Data:** Multiple ROI files
513
+ **Use Cases:** Batch ROI analysis
514
+ **Python Libraries:**
515
+ - `read-roi`: `read_roi.read_roi_zip('file.zip')`
516
+ - Standard `zipfile` module
517
+ **EDA Approach:**
518
+ - ROI count in set
519
+ - ROI type distribution
520
+ - Spatial distribution
521
+ - Overlapping ROI detection
522
+ - Naming conventions
523
+
524
+ ### .ome.tif / .ome.tiff - OME-TIFF
525
+ **Description:** TIFF with OME-XML metadata
526
+ **Typical Data:** Standardized microscopy with rich metadata
527
+ **Use Cases:** Bio-Formats compatible storage
528
+ **Python Libraries:**
529
+ - `tifffile`: OME-TIFF support
530
+ - `AICSImageIO`: OME reading
531
+ - `python-bioformats`: Bio-Formats integration
532
+ **EDA Approach:**
533
+ - OME-XML validation
534
+ - Physical dimensions extraction
535
+ - Channel naming and wavelengths
536
+ - Plane positions (Z, C, T)
537
+ - Instrument metadata
538
+ - Bio-Formats compatibility
539
+
540
+ ### .ome.zarr - OME-ZARR
541
+ **Description:** OME-NGFF specification on ZARR
542
+ **Typical Data:** Next-generation file format for bioimaging
543
+ **Use Cases:** Cloud-native imaging, large datasets
544
+ **Python Libraries:**
545
+ - `ome-zarr-py`: Official implementation
546
+ - `zarr`: Underlying array storage
547
+ **EDA Approach:**
548
+ - Multiscale resolution levels
549
+ - Metadata compliance with OME-NGFF spec
550
+ - Coordinate transformations
551
+ - Label and ROI handling
552
+ - Cloud storage optimization
553
+ - Chunk access patterns
554
+
555
+ ### .klb - Keller Lab Block
556
+ **Description:** Fast microscopy format for large data
557
+ **Typical Data:** Lightsheet microscopy, time-lapse
558
+ **Use Cases:** High-throughput imaging
559
+ **Python Libraries:**
560
+ - `pyklb`: KLB reading and writing
561
+ **EDA Approach:**
562
+ - Compression efficiency
563
+ - Block structure
564
+ - Multi-resolution support
565
+ - Read performance benchmarking
566
+ - Metadata extraction
567
+
568
+ ### Generic Whole Slide Imaging (multi-vendor formats)
569
+ **Description:** Virtual slide format (multiple vendors)
570
+ **Typical Data:** Pathology slides, large mosaics
571
+ **Use Cases:** Digital pathology
572
+ **Python Libraries:**
573
+ - `openslide-python`: Multi-format WSI
574
+ - `tiffslide`: Pure Python alternative
575
+ **EDA Approach:**
576
+ - Pyramid level count
577
+ - Downsampling factors
578
+ - Associated images (macro, label)
579
+ - Tile size and overlap
580
+ - MPP (microns per pixel)
581
+ - Background detection
582
+ - Tissue segmentation
583
+
584
+ ### .ndpi - Hamamatsu NanoZoomer
585
+ **Description:** Hamamatsu slide scanner format
586
+ **Typical Data:** Whole slide pathology images
587
+ **Use Cases:** Digital pathology workflows
588
+ **Python Libraries:**
589
+ - `openslide-python`: NDPI support
590
+ **EDA Approach:**
591
+ - Multi-resolution pyramid
592
+ - Lens and objective information
593
+ - Scan area and magnification
594
+ - Focal plane information
595
+ - Tissue detection
596
+
597
+ ### .svs - Aperio ScanScope
598
+ **Description:** Aperio whole slide format
599
+ **Typical Data:** Digital pathology slides
600
+ **Use Cases:** Pathology image analysis
601
+ **Python Libraries:**
602
+ - `openslide-python`: SVS support
603
+ **EDA Approach:**
604
+ - Pyramid structure
605
+ - MPP calibration
606
+ - Label and macro images
607
+ - Compression quality
608
+ - Thumbnail generation
609
+
610
+ ### .scn - Leica SCN
611
+ **Description:** Leica slide scanner format
612
+ **Typical Data:** Whole slide imaging
613
+ **Use Cases:** Digital pathology
614
+ **Python Libraries:**
615
+ - `openslide-python`: SCN support
616
+ **EDA Approach:**
617
+ - Tile structure analysis
618
+ - Collection organization
619
+ - Metadata extraction
620
+ - Magnification levels
.scider/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Proteomics and Metabolomics File Formats Reference
2
+
3
+ This reference covers file formats specific to proteomics, metabolomics, lipidomics, and related omics workflows.
4
+
5
+ ## Mass Spectrometry-Based Proteomics
6
+
7
+ ### .mzML - Mass Spectrometry Markup Language
8
+ **Description:** Standard XML format for MS data
9
+ **Typical Data:** MS1 and MS2 spectra, retention times, intensities
10
+ **Use Cases:** Proteomics, metabolomics pipelines
11
+ **Python Libraries:**
12
+ - `pymzml`: `pymzml.run.Reader('file.mzML')`
13
+ - `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
14
+ - `pyopenms`: OpenMS Python bindings
15
+ **EDA Approach:**
16
+ - Scan count and MS level distribution
17
+ - Total ion chromatogram (TIC) analysis
18
+ - Base peak chromatogram (BPC)
19
+ - m/z coverage and resolution
20
+ - Retention time range
21
+ - Precursor selection patterns
22
+ - Data completeness
23
+ - Quality control metrics (lock mass, standards)
24
+
25
+ ### .mzXML - Legacy MS XML Format
26
+ **Description:** Older XML-based MS format
27
+ **Typical Data:** Mass spectra with metadata
28
+ **Use Cases:** Legacy proteomics data
29
+ **Python Libraries:**
30
+ - `pyteomics.mzxml`
31
+ - `pymzml`: Can read mzXML
32
+ **EDA Approach:**
33
+ - Similar to mzML
34
+ - Format version compatibility
35
+ - Conversion quality validation
36
+ - Metadata preservation check
37
+
38
+ ### .mzid / .mzIdentML - Peptide Identification Format
39
+ **Description:** PSI standard for peptide identifications
40
+ **Typical Data:** Peptide-spectrum matches, proteins, scores
41
+ **Use Cases:** Search engine results, proteomics workflows
42
+ **Python Libraries:**
43
+ - `pyteomics.mzid`
44
+ - `pyopenms`: MzIdentML support
45
+ **EDA Approach:**
46
+ - PSM count and score distribution
47
+ - FDR calculation and filtering
48
+ - Modification analysis
49
+ - Missed cleavage statistics
50
+ - Protein inference results
51
+ - Search parameters validation
52
+ - Decoy hit analysis
53
+ - Rank-1 vs lower ranks
54
+
55
+ ### .pepXML - Trans-Proteomic Pipeline Peptide XML
56
+ **Description:** TPP format for peptide identifications
57
+ **Typical Data:** Search results with statistical validation
58
+ **Use Cases:** Proteomics database search output
59
+ **Python Libraries:**
60
+ - `pyteomics.pepxml`
61
+ **EDA Approach:**
62
+ - Search engine comparison
63
+ - Score distributions (XCorr, expect value, etc.)
64
+ - Charge state analysis
65
+ - Modification frequencies
66
+ - PeptideProphet probabilities
67
+ - Protein coverage
68
+ - Spectral counting
69
+
70
+ ### .protXML - Protein Inference Results
71
+ **Description:** TPP protein-level identifications
72
+ **Typical Data:** Protein groups, probabilities, peptides
73
+ **Use Cases:** Protein-level analysis
74
+ **Python Libraries:**
75
+ - `pyteomics.protxml`
76
+ **EDA Approach:**
77
+ - Protein group statistics
78
+ - Parsimonious protein sets
79
+ - ProteinProphet probabilities
80
+ - Coverage and peptide count per protein
81
+ - Unique vs shared peptides
82
+ - Protein molecular weight distribution
83
+ - GO term enrichment preparation
84
+
85
+ ### .pride.xml - PRIDE XML Format
86
+ **Description:** Proteomics Identifications Database format
87
+ **Typical Data:** Complete proteomics experiment data
88
+ **Use Cases:** Public data deposition (legacy)
89
+ **Python Libraries:**
90
+ - `pyteomics.xml`: Generic XML parsing helpers (pyteomics has no dedicated PRIDE XML reader)
91
+ - Custom XML parsers
92
+ **EDA Approach:**
93
+ - Experiment metadata extraction
94
+ - Identification completeness
95
+ - Cross-linking to spectra
96
+ - Protocol information
97
+ - Instrument details
98
+
99
+ ### .tsv / .csv (Proteomics)
100
+ **Description:** Tab or comma-separated proteomics results
101
+ **Typical Data:** Peptide or protein quantification tables
102
+ **Use Cases:** MaxQuant, Proteome Discoverer, Skyline output
103
+ **Python Libraries:**
104
+ - `pandas`: `pd.read_csv()` or `pd.read_table()`
105
+ **EDA Approach:**
106
+ - Identification counts
107
+ - Quantitative value distributions
108
+ - Missing value patterns
109
+ - Intensity-based analysis
110
+ - Label-free quantification assessment
111
+ - Isobaric tag ratio analysis
112
+ - Coefficient of variation
113
+ - Batch effects
114
+
115
+ ### .msf - Thermo MSF Database
116
+ **Description:** Proteome Discoverer results database
117
+ **Typical Data:** SQLite database with search results
118
+ **Use Cases:** Thermo Proteome Discoverer workflows
119
+ **Python Libraries:**
120
+ - `sqlite3`: Database access
121
+ - Custom MSF parsers
122
+ **EDA Approach:**
123
+ - Database schema exploration
124
+ - Peptide and protein tables
125
+ - Score thresholds
126
+ - Quantification data
127
+ - Processing node information
128
+ - Confidence levels
129
+
130
+ ### .pdResult - Proteome Discoverer Result
131
+ **Description:** Proteome Discoverer study results
132
+ **Typical Data:** Comprehensive search and quantification
133
+ **Use Cases:** PD study exports
134
+ **Python Libraries:**
135
+ - Vendor tools for conversion
136
+ - Export to TSV for Python analysis
137
+ **EDA Approach:**
138
+ - Study design validation
139
+ - Result filtering criteria
140
+ - Quantitative comparison groups
141
+ - Imputation strategies
142
+
143
+ ### .pep.xml - Peptide Summary
144
+ **Description:** Compact peptide identification format
145
+ **Typical Data:** Peptide sequences, modifications, scores
146
+ **Use Cases:** Downstream analysis input
147
+ **Python Libraries:**
148
+ - `pyteomics`: XML parsing
149
+ **EDA Approach:**
150
+ - Unique peptide counting
151
+ - PTM site localization
152
+ - Retention time predictability
153
+ - Charge state preferences
154
+
155
+ ## Quantitative Proteomics
156
+
157
+ ### .sky - Skyline Document
158
+ **Description:** Skyline targeted proteomics document
159
+ **Typical Data:** Transition lists, chromatograms, results
160
+ **Use Cases:** Targeted proteomics (SRM/MRM/PRM)
161
+ **Python Libraries:**
162
+ - `skyline`: Python API (limited)
163
+ - Export to CSV for analysis
164
+ **EDA Approach:**
165
+ - Transition selection validation
166
+ - Chromatographic peak quality
167
+ - Interference detection
168
+ - Retention time consistency
169
+ - Calibration curve assessment
170
+ - Replicate correlation
171
+ - LOD/LOQ determination
172
+
173
+ ### .sky.zip - Zipped Skyline Document
174
+ **Description:** Skyline document with external files
175
+ **Typical Data:** Complete Skyline analysis
176
+ **Use Cases:** Sharing Skyline projects
177
+ **Python Libraries:**
178
+ - `zipfile`: Extract for processing
179
+ **EDA Approach:**
180
+ - Document structure
181
+ - External file references
182
+ - Result export and analysis
183
+
184
+ ### .wiff - SCIEX WIFF Format
185
+ **Description:** SCIEX instrument data with quantitation
186
+ **Typical Data:** LC-MS/MS with MRM transitions
187
+ **Use Cases:** SCIEX QTRAP, TripleTOF data
188
+ **Python Libraries:**
189
+ - Vendor tools (limited Python access)
190
+ - Conversion to mzML
191
+ **EDA Approach:**
192
+ - MRM transition performance
193
+ - Dwell time optimization
194
+ - Cycle time analysis
195
+ - Peak integration quality
196
+
197
+ ### .raw (Thermo)
198
+ **Description:** Thermo raw instrument file
199
+ **Typical Data:** Full MS data from Orbitrap, Q Exactive
200
+ **Use Cases:** Label-free and TMT quantification
201
+ **Python Libraries:**
202
+ - `pymsfilereader`: Thermo RawFileReader
203
+ - `ThermoRawFileParser`: Cross-platform CLI
204
+ **EDA Approach:**
205
+ - MS1 and MS2 acquisition rates
206
+ - AGC target and fill times
207
+ - Resolution settings
208
+ - Isolation window validation
209
+ - SPS ion selection (TMT)
210
+ - Contamination assessment
211
+
212
+ ### .d (Agilent)
213
+ **Description:** Agilent data directory
214
+ **Typical Data:** LC-MS and GC-MS data
215
+ **Use Cases:** Agilent instrument workflows
216
+ **Python Libraries:**
217
+ - Community parsers
218
+ - Export to mzML
219
+ **EDA Approach:**
220
+ - Method consistency
221
+ - Calibration status
222
+ - Sequence run information
223
+ - Retention time stability
224
+
225
+ ## Metabolomics and Lipidomics
226
+
227
+ ### .mzML (Metabolomics)
228
+ **Description:** Standard MS format for metabolomics
229
+ **Typical Data:** Full scan MS, targeted MS/MS
230
+ **Use Cases:** Untargeted and targeted metabolomics
231
+ **Python Libraries:**
232
+ - Same as proteomics mzML tools
233
+ **EDA Approach:**
234
+ - Feature detection quality
235
+ - Mass accuracy assessment
236
+ - Retention time alignment
237
+ - Blank subtraction
238
+ - QC sample consistency
239
+ - Isotope pattern validation
240
+ - Adduct formation analysis
241
+ - In-source fragmentation check
242
+
243
+ ### .cdf / .netCDF - ANDI-MS
244
+ **Description:** Analytical Data Interchange for MS
245
+ **Typical Data:** GC-MS, LC-MS chromatography data
246
+ **Use Cases:** Metabolomics, GC-MS workflows
247
+ **Python Libraries:**
248
+ - `netCDF4`: Low-level access
249
+ - `pyopenms`: CDF support
250
+ - `xcms` via R integration
251
+ **EDA Approach:**
252
+ - TIC and extracted ion chromatograms
253
+ - Peak detection across samples
254
+ - Retention index calculation
255
+ - Mass spectral matching
256
+ - Library search preparation
257
+
258
+ ### .msp - Mass Spectral Format (NIST)
259
+ **Description:** NIST spectral library format
260
+ **Typical Data:** Reference mass spectra
261
+ **Use Cases:** Metabolite identification, library matching
262
+ **Python Libraries:**
263
+ - `matchms`: Spectral matching
264
+ - Custom MSP parsers
265
+ **EDA Approach:**
266
+ - Library coverage
267
+ - Metadata completeness (InChI, SMILES)
268
+ - Spectral quality metrics
269
+ - Collision energy standardization
270
+ - Precursor type annotation
271
+
272
+ ### .mgf (Metabolomics)
273
+ **Description:** Mascot Generic Format for MS/MS
274
+ **Typical Data:** MS/MS spectra for metabolite ID
275
+ **Use Cases:** Spectral library searching
276
+ **Python Libraries:**
277
+ - `matchms`: Metabolomics spectral analysis
278
+ - `pyteomics.mgf`
279
+ **EDA Approach:**
280
+ - Spectrum quality filtering
281
+ - Precursor isolation purity
282
+ - Fragment m/z accuracy
283
+ - Neutral loss patterns
284
+ - MS/MS completeness
285
+
286
+ ### .nmrML - NMR Markup Language
287
+ **Description:** Standard XML format for NMR metabolomics
288
+ **Typical Data:** 1D/2D NMR spectra with metadata
289
+ **Use Cases:** NMR-based metabolomics
290
+ **Python Libraries:**
291
+ - `nmrml2isa`: Format conversion
292
+ - Custom XML parsers
293
+ **EDA Approach:**
294
+ - Spectral quality metrics
295
+ - Binning consistency
296
+ - Reference compound validation
297
+ - pH and temperature effects
298
+ - Metabolite identification confidence
299
+
300
+ ### .json (Metabolomics)
301
+ **Description:** JSON format for metabolomics results
302
+ **Typical Data:** Feature tables, annotations, metadata
303
+ **Use Cases:** GNPS, MetaboAnalyst, web tools
304
+ **Python Libraries:**
305
+ - `json`: Standard library
306
+ - `pandas`: JSON normalization
307
+ **EDA Approach:**
308
+ - Feature annotation coverage
309
+ - GNPS clustering results
310
+ - Molecular networking statistics
311
+ - Adduct and in-source fragment linkage
312
+ - Putative identification confidence
313
+
314
+ ### .txt (Metabolomics Tables)
315
+ **Description:** Tab-delimited feature tables
316
+ **Typical Data:** m/z, RT, intensities across samples
317
+ **Use Cases:** MZmine, XCMS, MS-DIAL output
318
+ **Python Libraries:**
319
+ - `pandas`: Text file reading
320
+ **EDA Approach:**
321
+ - Feature count and quality
322
+ - Missing value imputation
323
+ - Data normalization assessment
324
+ - Batch correction validation
325
+ - PCA and clustering for QC
326
+ - Fold change calculations
327
+ - Statistical test preparation
328
+
329
+ ### .featureXML - OpenMS Feature Format
330
+ **Description:** OpenMS detected features
331
+ **Typical Data:** LC-MS features with quality scores
332
+ **Use Cases:** OpenMS workflows
333
+ **Python Libraries:**
334
+ - `pyopenms`: FeatureXML support
335
+ **EDA Approach:**
336
+ - Feature detection parameters
337
+ - Quality metrics per feature
338
+ - Isotope pattern fitting
339
+ - Charge state assignment
340
+ - FWHM and asymmetry
341
+
342
+ ### .consensusXML - OpenMS Consensus Features
343
+ **Description:** Linked features across samples
344
+ **Typical Data:** Aligned features with group info
345
+ **Use Cases:** Multi-sample LC-MS analysis
346
+ **Python Libraries:**
347
+ - `pyopenms`: ConsensusXML reading
348
+ **EDA Approach:**
349
+ - Feature correspondence quality
350
+ - Retention time alignment
351
+ - Missing value patterns
352
+ - Intensity normalization needs
353
+ - Batch-wise feature agreement
354
+
355
+ ### .idXML - OpenMS Identification Format
356
+ **Description:** Peptide/metabolite identifications
357
+ **Typical Data:** MS/MS identifications with scores
358
+ **Use Cases:** OpenMS ID workflows
359
+ **Python Libraries:**
360
+ - `pyopenms`: IdXML support
361
+ **EDA Approach:**
362
+ - Identification rate
363
+ - Score distribution
364
+ - Spectral match quality
365
+ - False discovery assessment
366
+ - Annotation transfer validation
367
+
368
+ ## Lipidomics-Specific Formats
369
+
370
+ ### .lcb - LipidCreator Batch
371
+ **Description:** LipidCreator transition list
372
+ **Typical Data:** Lipid transitions for targeted MS
373
+ **Use Cases:** Targeted lipidomics
374
+ **Python Libraries:**
375
+ - Export to CSV for processing
376
+ **EDA Approach:**
377
+ - Transition coverage per lipid class
378
+ - Retention time prediction
379
+ - Collision energy optimization
380
+ - Class-specific fragmentation patterns
381
+
382
+ ### .mzTab - Proteomics/Metabolomics Tabular Format
383
+ **Description:** PSI tabular summary format
384
+ **Typical Data:** Protein/peptide/metabolite quantification
385
+ **Use Cases:** Publication and data sharing
386
+ **Python Libraries:**
387
+ - `pyteomics.mztab`
388
+ - `pandas` for TSV-like structure
389
+ **EDA Approach:**
390
+ - Data completeness
391
+ - Metadata section validation
392
+ - Quantification method
393
+ - Identification confidence
394
+ - Software and parameters
395
+ - Quality metrics summary
396
+
397
+ ### .csv (LipidSearch, LipidMatch)
398
+ **Description:** Lipid identification results
399
+ **Typical Data:** Lipid annotations, grades, intensities
400
+ **Use Cases:** Lipidomics software output
401
+ **Python Libraries:**
402
+ - `pandas`: CSV reading
403
+ **EDA Approach:**
404
+ - Lipid class distribution
405
+ - Identification grade/confidence
406
+ - Fatty acid composition analysis
407
+ - Double bond and chain length patterns
408
+ - Intensity correlations
409
+ - Normalization to internal standards
410
+
411
+ ### .sdf (Metabolomics)
412
+ **Description:** Structure data file for metabolites
413
+ **Typical Data:** Chemical structures with properties
414
+ **Use Cases:** Metabolite database creation
415
+ **Python Libraries:**
416
+ - `RDKit`: `Chem.SDMolSupplier('file.sdf')`
417
+ **EDA Approach:**
418
+ - Structure validation
419
+ - Property calculation (logP, MW, TPSA)
420
+ - Molecular formula consistency
421
+ - Tautomer enumeration
422
+ - Retention time prediction features
423
+
424
+ ### .mol (Metabolomics)
425
+ **Description:** Single molecule structure files
426
+ **Typical Data:** Metabolite chemical structure
427
+ **Use Cases:** Structure-based searches
428
+ **Python Libraries:**
429
+ - `RDKit`: `Chem.MolFromMolFile('file.mol')`
430
+ **EDA Approach:**
431
+ - Structure correctness
432
+ - Stereochemistry validation
433
+ - Charge state
434
+ - Implicit hydrogen handling
435
+
436
+ ## Data Processing and Analysis
437
+
438
+ ### .h5 / .hdf5 (Omics)
439
+ **Description:** HDF5 for large omics datasets
440
+ **Typical Data:** Feature matrices, spectra, metadata
441
+ **Use Cases:** Large-scale studies, cloud computing
442
+ **Python Libraries:**
443
+ - `h5py`: HDF5 access
444
+ - `anndata`: For single-cell proteomics
445
+ **EDA Approach:**
446
+ - Dataset organization
447
+ - Chunking and compression
448
+ - Metadata structure
449
+ - Efficient data access patterns
450
+ - Sample and feature annotations
451
+
452
+ ### .Rdata / .rds - R Objects
453
+ **Description:** Serialized R analysis objects
454
+ **Typical Data:** Processed omics results from R packages
455
+ **Use Cases:** xcms, CAMERA, MSnbase workflows
456
+ **Python Libraries:**
457
+ - `pyreadr`: `pyreadr.read_r('file.Rdata')`
458
+ - `rpy2`: R-Python integration
459
+ **EDA Approach:**
460
+ - Object structure exploration
461
+ - Data extraction
462
+ - Method parameter review
463
+ - Conversion to Python-native formats
464
+
465
+ ### .mzTab-M - Metabolomics mzTab
466
+ **Description:** mzTab specific to metabolomics
467
+ **Typical Data:** Small molecule quantification
468
+ **Use Cases:** Metabolomics data sharing
469
+ **Python Libraries:**
470
+ - `pyteomics.mztab`: Can parse mzTab-M
471
+ **EDA Approach:**
472
+ - Small molecule evidence
473
+ - Feature quantification
474
+ - Database references (HMDB, KEGG, etc.)
475
+ - Adduct and charge annotation
476
+ - MS level information
477
+
478
+ ### .parquet (Omics)
479
+ **Description:** Columnar storage for large tables
480
+ **Typical Data:** Feature matrices, metadata
481
+ **Use Cases:** Efficient big data omics
482
+ **Python Libraries:**
483
+ - `pandas`: `pd.read_parquet()`
484
+ - `pyarrow`: Direct parquet access
485
+ **EDA Approach:**
486
+ - Compression efficiency
487
+ - Column-wise statistics
488
+ - Partition structure
489
+ - Schema validation
490
+ - Fast filtering and aggregation
491
+
492
+ ### .pkl (Omics Models)
493
+ **Description:** Pickled Python objects
494
+ **Typical Data:** ML models, processed data
495
+ **Use Cases:** Workflow intermediate storage
496
+ **Python Libraries:**
497
+ - `pickle`: Standard serialization
498
+ - `joblib`: Enhanced pickling
499
+ **EDA Approach:**
500
+ - Object type and structure
501
+ - Model parameters
502
+ - Feature importance (if ML model)
503
+ - Data shapes and types
504
+ - Deserialization validation
505
+
506
+ ### .zarr (Omics)
507
+ **Description:** Chunked, compressed array storage
508
+ **Typical Data:** Multi-dimensional omics data
509
+ **Use Cases:** Cloud-optimized analysis
510
+ **Python Libraries:**
511
+ - `zarr`: Array storage
512
+ **EDA Approach:**
513
+ - Chunk optimization
514
+ - Compression codecs
515
+ - Multi-scale data
516
+ - Parallel access patterns
517
+ - Metadata annotations
.scider/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spectroscopy and Analytical Chemistry File Formats Reference
2
+
3
+ This reference covers file formats used in various spectroscopic techniques and analytical chemistry instrumentation.
4
+
5
+ ## NMR Spectroscopy
6
+
7
+ ### .fid - NMR Free Induction Decay
8
+ **Description:** Raw time-domain NMR data from Bruker, Agilent, JEOL
9
+ **Typical Data:** Complex time-domain signal
10
+ **Use Cases:** NMR spectroscopy, structure elucidation
11
+ **Python Libraries:**
12
+ - `nmrglue`: `nmrglue.bruker.read('expdir')` (Bruker experiment directory containing `fid`/`acqus`) or `nmrglue.varian.read_fid('fid')` (Varian/Agilent)
13
+ - `nmrstarlib`: NMR data handling
14
+ **EDA Approach:**
15
+ - Time-domain signal decay
16
+ - Sampling rate and acquisition time
17
+ - Number of data points
18
+ - Signal-to-noise ratio estimation
19
+ - Baseline drift assessment
20
+ - Digital filter effects
21
+ - Acquisition parameter validation
22
+ - Apodization function selection
23
+
24
+ ### .ft / .ft1 / .ft2 - NMR Frequency Domain
25
+ **Description:** Fourier-transformed NMR spectrum
26
+ **Typical Data:** Processed frequency-domain data
27
+ **Use Cases:** NMR analysis, peak integration
28
+ **Python Libraries:**
29
+ - `nmrglue`: Frequency domain reading
30
+ - Custom processing pipelines
31
+ **EDA Approach:**
32
+ - Peak picking and integration
33
+ - Chemical shift range
34
+ - Baseline correction quality
35
+ - Phase correction assessment
36
+ - Reference peak identification
37
+ - Spectral resolution
38
+ - Artifact detection
39
+ - Multiplicity analysis
40
+
41
+ ### .1r / .2rr - Bruker NMR Processed Data
42
+ **Description:** Bruker processed spectrum (real part)
43
+ **Typical Data:** 1D or 2D processed NMR spectra
44
+ **Use Cases:** NMR data analysis with Bruker software
45
+ **Python Libraries:**
46
+ - `nmrglue`: Bruker format support
47
+ **EDA Approach:**
48
+ - Processing parameters review
49
+ - Window function effects
50
+ - Zero-filling assessment
51
+ - Linear prediction validation
52
+ - Spectral artifacts
53
+
54
+ ### .dx - NMR JCAMP-DX
55
+ **Description:** JCAMP-DX format for NMR
56
+ **Typical Data:** Standardized NMR spectrum
57
+ **Use Cases:** Data exchange between software
58
+ **Python Libraries:**
59
+ - `jcamp`: JCAMP reader
60
+ - `nmrglue`: Can import JCAMP
61
+ **EDA Approach:**
62
+ - Format compliance
63
+ - Metadata completeness
64
+ - Peak table validation
65
+ - Integration values
66
+ - Compound identification info
67
+
68
+ ### .mnova - Mnova Format
69
+ **Description:** Mestrelab Research Mnova format
70
+ **Typical Data:** NMR data with processing info
71
+ **Use Cases:** Mnova software workflows
72
+ **Python Libraries:**
73
+ - `nmrglue`: Limited Mnova support
74
+ - Conversion tools to standard formats
75
+ **EDA Approach:**
76
+ - Multi-spectrum handling
77
+ - Processing pipeline review
78
+ - Quantification data
79
+ - Structure assignment
80
+
81
+ ## Mass Spectrometry
82
+
83
+ ### .mzML - Mass Spectrometry Markup Language
84
+ **Description:** Standard XML-based MS format
85
+ **Typical Data:** MS spectra, chromatograms, metadata
86
+ **Use Cases:** Proteomics, metabolomics, lipidomics
87
+ **Python Libraries:**
88
+ - `pymzml`: `pymzml.run.Reader('file.mzML')`
89
+ - `pyteomics.mzml`: `pyteomics.mzml.read('file.mzML')`
90
+ - `MSFileReader`: Various wrappers
91
+ **EDA Approach:**
92
+ - Scan count and MS level distribution
93
+ - Retention time range and TIC
94
+ - m/z range and resolution
95
+ - Precursor ion selection
96
+ - Fragmentation patterns
97
+ - Instrument configuration
98
+ - Quality control metrics
99
+ - Data completeness
100
+
101
+ ### .mzXML - Mass Spectrometry XML
102
+ **Description:** Legacy XML MS format
103
+ **Typical Data:** Mass spectra and chromatograms
104
+ **Use Cases:** Proteomics workflows (older)
105
+ **Python Libraries:**
106
+ - `pyteomics.mzxml`
107
+ - `pymzml`: Can read mzXML
108
+ **EDA Approach:**
109
+ - Similar to mzML
110
+ - Version compatibility
111
+ - Conversion quality assessment
112
+
113
+ ### .mzData - mzData Format
114
+ **Description:** Legacy PSI MS format
115
+ **Typical Data:** Mass spectrometry data
116
+ **Use Cases:** Legacy data archives
117
+ **Python Libraries:**
118
+ - `pyteomics`: Limited support
119
+ - Conversion to mzML recommended
120
+ **EDA Approach:**
121
+ - Format conversion validation
122
+ - Data completeness
123
+ - Metadata extraction
124
+
125
+ ### .raw - Vendor Raw Files (Thermo, Waters)
126
+ **Description:** Proprietary instrument data
127
+ **Typical Data:** Raw mass spectra and metadata
128
+ **Use Cases:** Direct instrument output
129
+ **Python Libraries:**
130
+ - `pymsfilereader`: Thermo RAW files
131
+ - `ThermoRawFileParser`: CLI wrapper
132
+ - Vendor-specific APIs
133
+ **EDA Approach:**
134
+ - Method parameter extraction
135
+ - Instrument performance metrics
136
+ - Calibration status
137
+ - Scan function analysis
138
+ - MS/MS quality metrics
139
+ - Dynamic exclusion evaluation
140
+
141
+ ### .d - Agilent Data Directory
142
+ **Description:** Agilent MS data folder
143
+ **Typical Data:** LC-MS, GC-MS with methods
144
+ **Use Cases:** Agilent MassHunter workflows
145
+ **Python Libraries:**
146
+ - Community parsers
147
+ - Chemstation integration
148
+ **EDA Approach:**
149
+ - Directory structure validation
150
+ - Method parameters
151
+ - Calibration curves
152
+ - Sequence metadata
153
+ - Signal quality metrics
154
+
155
+ ### .wiff - AB SCIEX Data
156
+ **Description:** AB SCIEX/SCIEX instrument format
157
+ **Typical Data:** Mass spectrometry data
158
+ **Use Cases:** SCIEX instrument workflows
159
+ **Python Libraries:**
160
+ - Vendor SDKs (limited Python support)
161
+ - Conversion tools
162
+ **EDA Approach:**
163
+ - Experiment type identification
164
+ - Scan properties
165
+ - Quantitation data
166
+ - Multi-experiment structure
167
+
168
+ ### .mgf - Mascot Generic Format
169
+ **Description:** Peak list format for MS/MS
170
+ **Typical Data:** Precursor and fragment masses
171
+ **Use Cases:** Peptide identification, database searches
172
+ **Python Libraries:**
173
+ - `pyteomics.mgf`: `pyteomics.mgf.read('file.mgf')`
174
+ - `pyopenms`: MGF support
175
+ **EDA Approach:**
176
+ - Spectrum count
177
+ - Charge state distribution
178
+ - Precursor m/z and intensity
179
+ - Fragment peak count
180
+ - Mass accuracy
181
+ - Title and metadata parsing
182
+
183
+ ### .pkl - Peak List (Binary)
184
+ **Description:** Binary peak list format
185
+ **Typical Data:** Serialized MS/MS spectra
186
+ **Use Cases:** Software-specific storage
187
+ **Python Libraries:**
188
+ - `pickle`: Standard deserialization
189
+ - `pyteomics`: PKL support
190
+ **EDA Approach:**
191
+ - Data structure inspection
192
+ - Conversion to standard formats
193
+ - Metadata preservation
194
+
195
+ ### .ms1 / .ms2 - MS1/MS2 Formats
196
+ **Description:** Simple text format for MS data
197
+ **Typical Data:** MS1 and MS2 scans
198
+ **Use Cases:** Database searching, proteomics
199
+ **Python Libraries:**
200
+ - `pyteomics.ms1` and `pyteomics.ms2`
201
+ - Simple text parsing
202
+ **EDA Approach:**
203
+ - Scan count by level
204
+ - Retention time series
205
+ - Charge state analysis
206
+ - m/z range coverage
207
+
208
+ ### .pepXML - Peptide XML
209
+ **Description:** Trans-Proteomic Pipeline (TPP) peptide identification format
210
+ **Typical Data:** Peptide-spectrum matches
211
+ **Use Cases:** Proteomics search results
212
+ **Python Libraries:**
213
+ - `pyteomics.pepxml`
214
+ **EDA Approach:**
215
+ - Search result statistics
216
+ - Score distribution
217
+ - Modification analysis
218
+ - FDR assessment
219
+ - Enzyme specificity
220
+
221
+ ### .protXML - Protein XML
222
+ **Description:** Trans-Proteomic Pipeline (TPP) protein inference format
223
+ **Typical Data:** Protein identifications
224
+ **Use Cases:** Proteomics protein-level results
225
+ **Python Libraries:**
226
+ - `pyteomics.protxml`
227
+ **EDA Approach:**
228
+ - Protein group analysis
229
+ - Coverage statistics
230
+ - Confidence scoring
231
+ - Parsimony analysis
232
+
233
+ ### .msp - NIST MS Search Format
234
+ **Description:** NIST spectral library format
235
+ **Typical Data:** Reference mass spectra
236
+ **Use Cases:** Spectral library searching
237
+ **Python Libraries:**
238
+ - `matchms`: Spectral library handling
239
+ - Custom parsers
240
+ **EDA Approach:**
241
+ - Library size and coverage
242
+ - Metadata completeness
243
+ - Peak count statistics
244
+ - Compound annotation quality
245
+
246
+ ## Infrared and Raman Spectroscopy
247
+
248
+ ### .spc - Galactic SPC
249
+ **Description:** Thermo Galactic spectroscopy format
250
+ **Typical Data:** IR, Raman, UV-Vis spectra
251
+ **Use Cases:** Various spectroscopy instruments
252
+ **Python Libraries:**
253
+ - `spc`: `spc.File('file.spc')`
254
+ - `specio`: Multi-format reader
255
+ **EDA Approach:**
256
+ - Wavenumber/wavelength range
257
+ - Data point density
258
+ - Multi-spectrum handling
259
+ - Baseline characteristics
260
+ - Peak identification
261
+ - Absorbance/transmittance mode
262
+ - Instrument information
263
+
264
+ ### .spa - Thermo Nicolet
265
+ **Description:** Thermo Fisher FTIR format
266
+ **Typical Data:** FTIR spectra
267
+ **Use Cases:** OMNIC software data
268
+ **Python Libraries:**
269
+ - Custom binary parsers
270
+ - Conversion to JCAMP or SPC
271
+ **EDA Approach:**
272
+ - Interferogram vs spectrum
273
+ - Background spectrum validation
274
+ - Atmospheric compensation
275
+ - Resolution and scan number
276
+ - Sample information
277
+
278
+ ### .0 - Bruker OPUS
279
+ **Description:** Bruker OPUS FTIR format (numbered files)
280
+ **Typical Data:** FTIR spectra and metadata
281
+ **Use Cases:** Bruker FTIR instruments
282
+ **Python Libraries:**
283
+ - `brukeropusreader`: OPUS format parser
284
+ - `specio`: OPUS support
285
+ **EDA Approach:**
286
+ - Multiple block types (AB, ScSm, etc.)
287
+ - Sample and reference spectra
288
+ - Instrument parameters
289
+ - Optical path configuration
290
+ - Beam splitter and detector info
291
+
292
+ ### .dpt - Data Point Table
293
+ **Description:** Simple XY data format
294
+ **Typical Data:** Generic spectroscopic data
295
+ **Use Cases:** Renishaw Raman, generic exports
296
+ **Python Libraries:**
297
+ - `pandas`: CSV-like reading
298
+ - Text parsing
299
+ **EDA Approach:**
300
+ - X-axis type (wavelength, wavenumber, Raman shift)
301
+ - Y-axis units (intensity, absorbance, etc.)
302
+ - Data point spacing
303
+ - Header information
304
+ - Multi-column data handling
305
+
306
+ ### .wdf - Renishaw Raman
307
+ **Description:** Renishaw WiRE data format
308
+ **Typical Data:** Raman spectra and maps
309
+ **Use Cases:** Renishaw Raman microscopy
310
+ **Python Libraries:**
311
+ - `renishawWiRE`: WDF reader
312
+ - Custom parsers for WDF format
313
+ **EDA Approach:**
314
+ - Spectral vs mapping data
315
+ - Laser wavelength
316
+ - Accumulation and exposure time
317
+ - Spatial coordinates (mapping)
318
+ - Z-scan data
319
+ - Baseline and cosmic ray correction
320
+
321
+ ### .txt (Spectroscopy)
322
+ **Description:** Generic text export from instruments
323
+ **Typical Data:** Wavelength/wavenumber and intensity
324
+ **Use Cases:** Universal data exchange
325
+ **Python Libraries:**
326
+ - `pandas`: Text file reading
327
+ - `numpy`: Simple array loading
328
+ **EDA Approach:**
329
+ - Delimiter and format detection
330
+ - Header parsing
331
+ - Units identification
332
+ - Multiple spectrum handling
333
+ - Metadata extraction from comments
334
+
335
+ ## UV-Visible Spectroscopy
336
+
337
+ ### .asd / .asc - ASD Binary/ASCII
338
+ **Description:** ASD FieldSpec spectroradiometer
339
+ **Typical Data:** Hyperspectral UV-Vis-NIR data
340
+ **Use Cases:** Remote sensing, reflectance spectroscopy
341
+ **Python Libraries:**
342
+ - `spectral.io.asd`: ASD format support
343
+ - Custom parsers
344
+ **EDA Approach:**
345
+ - Wavelength range (UV to NIR)
346
+ - Reference spectrum validation
347
+ - Dark current correction
348
+ - Integration time
349
+ - GPS metadata (if present)
350
+ - Reflectance vs radiance
351
+
352
+ ### .sp - Perkin Elmer
353
+ **Description:** Perkin Elmer UV/Vis format
354
+ **Typical Data:** UV-Vis spectrophotometer data
355
+ **Use Cases:** PE Lambda instruments
356
+ **Python Libraries:**
357
+ - Custom parsers
358
+ - Conversion to standard formats
359
+ **EDA Approach:**
360
+ - Scan parameters
361
+ - Baseline correction
362
+ - Multi-wavelength scans
363
+ - Time-based measurements
364
+ - Sample/reference handling
365
+
366
+ ### .csv (Spectroscopy)
367
+ **Description:** CSV export from UV-Vis instruments
368
+ **Typical Data:** Wavelength and absorbance/transmittance
369
+ **Use Cases:** Universal format for UV-Vis data
370
+ **Python Libraries:**
371
+ - `pandas`: Native CSV support
372
+ **EDA Approach:**
373
+ - Lambda max identification
374
+ - Beer's law compliance
375
+ - Baseline offset
376
+ - Path length correction
377
+ - Concentration calculations
378
+
379
+ ## X-ray and Diffraction
380
+
381
+ ### .cif - Crystallographic Information File
382
+ **Description:** Crystal structure and diffraction data
383
+ **Typical Data:** Unit cell, atomic positions, structure factors
384
+ **Use Cases:** Crystallography, materials science
385
+ **Python Libraries:**
386
+ - `gemmi`: `gemmi.cif.read_file('file.cif')`
387
+ - `PyCifRW`: CIF reading/writing
388
+ - `pymatgen`: Materials structure analysis
389
+ **EDA Approach:**
390
+ - Crystal system and space group
391
+ - Unit cell parameters
392
+ - Atomic positions and occupancy
393
+ - Thermal parameters
394
+ - R-factors and refinement quality
395
+ - Completeness and redundancy
396
+ - Structure validation
397
+
398
+ ### .hkl - Reflection Data
399
+ **Description:** Miller indices and intensities
400
+ **Typical Data:** Integrated diffraction intensities
401
+ **Use Cases:** Crystallographic refinement
402
+ **Python Libraries:**
403
+ - Custom parsers (format dependent)
404
+ - Crystallography packages (CCP4, etc.)
405
+ **EDA Approach:**
406
+ - Resolution range
407
+ - Completeness by shell
408
+ - I/sigma distribution
409
+ - Systematic absences
410
+ - Twinning detection
411
+ - Wilson plot
412
+
413
+ ### .mtz - MTZ Format (CCP4)
414
+ **Description:** Binary crystallographic data
415
+ **Typical Data:** Reflections, phases, structure factors
416
+ **Use Cases:** Macromolecular crystallography
417
+ **Python Libraries:**
418
+ - `gemmi`: MTZ support
419
+ - `cctbx`: Comprehensive crystallography
420
+ **EDA Approach:**
421
+ - Column types and data
422
+ - Resolution limits
423
+ - R-factors (Rwork, Rfree)
424
+ - Phase probability distribution
425
+ - Map coefficients
426
+ - Batch information
427
+
428
+ ### .xy / .xye - Powder Diffraction
429
+ **Description:** 2-theta vs intensity data
430
+ **Typical Data:** Powder X-ray diffraction patterns
431
+ **Use Cases:** Phase identification, Rietveld refinement
432
+ **Python Libraries:**
433
+ - `pandas`: Simple XY reading
434
+ - `pymatgen`: XRD pattern analysis
435
+ **EDA Approach:**
436
+ - 2-theta range
437
+ - Peak positions and intensities
438
+ - Background modeling
439
+ - Peak width analysis (strain/size)
440
+ - Phase identification via matching
441
+ - Preferred orientation effects
442
+
443
+ ### .raw (XRD)
444
+ **Description:** Vendor-specific XRD raw data
445
+ **Typical Data:** XRD patterns with metadata
446
+ **Use Cases:** Bruker, PANalytical, Rigaku instruments
447
+ **Python Libraries:**
448
+ - Vendor-specific parsers
449
+ - Conversion tools
450
+ **EDA Approach:**
451
+ - Scan parameters (step size, time)
452
+ - Sample alignment
453
+ - Incident beam setup
454
+ - Detector configuration
455
+ - Background scan validation
456
+
457
+ ### .gsa / .gsas - GSAS Format
458
+ **Description:** General Structure Analysis System
459
+ **Typical Data:** Powder diffraction for Rietveld
460
+ **Use Cases:** Rietveld refinement
461
+ **Python Libraries:**
462
+ - GSAS-II Python interface
463
+ - Custom parsers
464
+ **EDA Approach:**
465
+ - Histogram data
466
+ - Instrument parameters
467
+ - Phase information
468
+ - Refinement constraints
469
+ - Profile function parameters
470
+
471
+ ## Electron Spectroscopy
472
+
473
+ ### .vms - VG Scienta
474
+ **Description:** VAMAS (ISO 14976) surface-analysis exchange format, as exported by VG Scienta and other photoelectron spectrometers
475
+ **Typical Data:** XPS, UPS, ARPES spectra
476
+ **Use Cases:** Photoelectron spectroscopy
477
+ **Python Libraries:**
478
+ - Custom parsers for VMS
479
+ - `specio`: Multi-format support
480
+ **EDA Approach:**
481
+ - Binding energy calibration
482
+ - Pass energy and resolution
483
+ - Photoelectron line identification
484
+ - Satellite peak analysis
485
+ - Background subtraction quality
486
+ - Fermi edge position
487
+
488
+ ### .spe - WinSpec/SPE Format
489
+ **Description:** Princeton Instruments/Roper Scientific
490
+ **Typical Data:** CCD spectra, Raman, PL
491
+ **Use Cases:** Spectroscopy with CCD detectors
492
+ **Python Libraries:**
493
+ - `spe2py`: SPE file reader
494
+ - `spe_loader`: Alternative parser
495
+ **EDA Approach:**
496
+ - CCD frame analysis
497
+ - Wavelength calibration
498
+ - Dark frame subtraction
499
+ - Cosmic ray identification
500
+ - Readout noise
501
+ - Accumulation statistics
502
+
503
+ ### .pxt - Princeton PTI
504
+ **Description:** Photon Technology International
505
+ **Typical Data:** Fluorescence, phosphorescence spectra
506
+ **Use Cases:** Fluorescence spectroscopy
507
+ **Python Libraries:**
508
+ - Custom parsers
509
+ - Text-based format variants
510
+ **EDA Approach:**
511
+ - Excitation and emission spectra
512
+ - Quantum yield calculations
513
+ - Time-resolved measurements
514
+ - Temperature-dependent data
515
+ - Correction factors applied
516
+
517
+ ### .dat (Spectroscopy Generic)
518
+ **Description:** Generic binary or text spectroscopy data
519
+ **Typical Data:** Various spectroscopic measurements
520
+ **Use Cases:** Many instruments use .dat extension
521
+ **Python Libraries:**
522
+ - Format-specific identification needed
523
+ - `numpy`, `pandas` for known formats
524
+ **EDA Approach:**
525
+ - Format detection (binary vs text)
526
+ - Header identification
527
+ - Data structure inference
528
+ - Units and axis labels
529
+ - Instrument signature detection
530
+
531
+ ## Chromatography
532
+
533
+ ### .chrom - Chromatogram Data
534
+ **Description:** Generic chromatography format
535
+ **Typical Data:** Retention time vs signal
536
+ **Use Cases:** HPLC, GC, LC-MS
537
+ **Python Libraries:**
538
+ - Vendor-specific parsers
539
+ - `pandas` for text exports
540
+ **EDA Approach:**
541
+ - Retention time range
542
+ - Peak detection and integration
543
+ - Baseline drift
544
+ - Resolution between peaks
545
+ - Signal-to-noise ratio
546
+ - Tailing factor
547
+
548
+ ### .ch - ChemStation
549
+ **Description:** Agilent ChemStation format
550
+ **Typical Data:** Chromatograms and method parameters
551
+ **Use Cases:** Agilent HPLC and GC systems
552
+ **Python Libraries:**
553
+ - `agilent-chemstation`: Community tools
554
+ - Binary format parsers
555
+ **EDA Approach:**
556
+ - Method validation
557
+ - Integration parameters
558
+ - Calibration curve
559
+ - Sample sequence information
560
+ - Instrument status
561
+
562
+ ### .arw - Empower (Waters)
563
+ **Description:** Waters Empower format
564
+ **Typical Data:** UPLC/HPLC chromatograms
565
+ **Use Cases:** Waters instrument data
566
+ **Python Libraries:**
567
+ - Vendor tools (limited Python access)
568
+ - Database extraction tools
569
+ **EDA Approach:**
570
+ - Audit trail information
571
+ - Processing methods
572
+ - Compound identification
573
+ - Quantitation results
574
+ - System suitability tests
575
+
576
+ ### .lcd - Shimadzu LabSolutions
577
+ **Description:** Shimadzu chromatography format
578
+ **Typical Data:** GC/HPLC data
579
+ **Use Cases:** Shimadzu instruments
580
+ **Python Libraries:**
581
+ - Vendor-specific parsers
582
+ **EDA Approach:**
583
+ - Method parameters
584
+ - Peak purity analysis
585
+ - Spectral data (if PDA)
586
+ - Quantitative results
587
+
588
+ ## Other Analytical Techniques
589
+
590
+ ### .dta - DSC/TGA Data
591
+ **Description:** Thermal analysis data (TA Instruments)
592
+ **Typical Data:** Temperature vs heat flow or mass
593
+ **Use Cases:** Differential scanning calorimetry, thermogravimetry
594
+ **Python Libraries:**
595
+ - Custom parsers for TA formats
596
+ - `pandas` for exported data
597
+ **EDA Approach:**
598
+ - Transition temperature identification
599
+ - Enthalpy calculations
600
+ - Mass loss steps
601
+ - Heating rate effects
602
+ - Baseline determination
603
+ - Purity assessment
604
+
605
+ ### .run - ICP-MS/ICP-OES
606
+ **Description:** Elemental analysis data
607
+ **Typical Data:** Element concentrations or counts
608
+ **Use Cases:** Inductively coupled plasma MS/OES
609
+ **Python Libraries:**
610
+ - Vendor-specific tools
611
+ - Custom parsers
612
+ **EDA Approach:**
613
+ - Element detection and quantitation
614
+ - Internal standard performance
615
+ - Spike recovery
616
+ - Dilution factor corrections
617
+ - Isotope ratios
618
+ - LOD/LOQ calculations
619
+
620
+ ### .exp - Electrochemistry Data
621
+ **Description:** Electrochemical experiment data
622
+ **Typical Data:** Potential vs current or charge
623
+ **Use Cases:** Cyclic voltammetry, chronoamperometry
624
+ **Python Libraries:**
625
+ - Custom parsers per instrument (CHI, Gamry, etc.)
626
+ - `galvani`: Biologic EC-Lab files
627
+ **EDA Approach:**
628
+ - Redox peak identification
629
+ - Peak potential and current
630
+ - Scan rate effects
631
+ - Electron transfer kinetics
632
+ - Background subtraction
633
+ - Capacitance calculations
.scider/skills/exploratory-data-analysis/scripts/eda_analyzer.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Exploratory Data Analysis Analyzer
4
+ Analyzes scientific data files and generates comprehensive markdown reports
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import sys
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+
13
+
14
def detect_file_type(filepath):
    """
    Classify a file by its extension.

    Only the final suffix of the path is inspected (case-insensitively). The
    file content is never read, so the path does not have to exist. Compound
    suffixes are not unwrapped: ``reads.fastq.gz`` is classified by ``gz``.

    Args:
        filepath: Path to the file (str or path-like).

    Returns:
        tuple: (extension, category, description) where ``extension`` is the
            lower-cased suffix without its leading dot, ``category`` is the
            format family (e.g. 'chemistry_molecular') and ``description`` is
            a human-readable format name. Unrecognized or missing extensions
            yield (extension, "unknown", "Unknown Format").
    """
    extension = Path(filepath).suffix.lower()

    # Map extensions to (category, description)
    extension_map = {
        # Chemistry/Molecular
        "pdb": ("chemistry_molecular", "Protein Data Bank"),
        "cif": ("chemistry_molecular", "Crystallographic Information File"),
        "mol": ("chemistry_molecular", "MDL Molfile"),
        "mol2": ("chemistry_molecular", "Tripos Mol2"),
        "sdf": ("chemistry_molecular", "Structure Data File"),
        "xyz": ("chemistry_molecular", "XYZ Coordinates"),
        "smi": ("chemistry_molecular", "SMILES String"),
        "smiles": ("chemistry_molecular", "SMILES String"),
        "pdbqt": ("chemistry_molecular", "AutoDock PDBQT"),
        "mae": ("chemistry_molecular", "Maestro Format"),
        "gro": ("chemistry_molecular", "GROMACS Coordinate File"),
        "log": ("chemistry_molecular", "Gaussian Log File"),
        "out": ("chemistry_molecular", "Quantum Chemistry Output"),
        "wfn": ("chemistry_molecular", "Wavefunction Files"),
        "wfx": ("chemistry_molecular", "Wavefunction Files"),
        "fchk": ("chemistry_molecular", "Gaussian Formatted Checkpoint"),
        "cube": ("chemistry_molecular", "Gaussian Cube File"),
        "dcd": ("chemistry_molecular", "Binary Trajectory"),
        "xtc": ("chemistry_molecular", "Compressed Trajectory"),
        "trr": ("chemistry_molecular", "GROMACS Trajectory"),
        "nc": ("chemistry_molecular", "Amber NetCDF Trajectory"),
        "netcdf": ("chemistry_molecular", "Amber NetCDF Trajectory"),
        # Bioinformatics/Genomics
        "fasta": ("bioinformatics_genomics", "FASTA Format"),
        "fa": ("bioinformatics_genomics", "FASTA Format"),
        "fna": ("bioinformatics_genomics", "FASTA Format"),
        "fastq": ("bioinformatics_genomics", "FASTQ Format"),
        "fq": ("bioinformatics_genomics", "FASTQ Format"),
        "sam": ("bioinformatics_genomics", "Sequence Alignment/Map"),
        "bam": ("bioinformatics_genomics", "Binary Alignment/Map"),
        "cram": ("bioinformatics_genomics", "CRAM Format"),
        "bed": ("bioinformatics_genomics", "Browser Extensible Data"),
        "bedgraph": ("bioinformatics_genomics", "BED with Graph Data"),
        "bigwig": ("bioinformatics_genomics", "Binary BigWig"),
        "bw": ("bioinformatics_genomics", "Binary BigWig"),
        "bigbed": ("bioinformatics_genomics", "Binary BigBed"),
        "bb": ("bioinformatics_genomics", "Binary BigBed"),
        "gff": ("bioinformatics_genomics", "General Feature Format"),
        "gff3": ("bioinformatics_genomics", "General Feature Format"),
        "gtf": ("bioinformatics_genomics", "Gene Transfer Format"),
        "vcf": ("bioinformatics_genomics", "Variant Call Format"),
        "bcf": ("bioinformatics_genomics", "Binary VCF"),
        "gvcf": ("bioinformatics_genomics", "Genomic VCF"),
        # Microscopy/Imaging
        "tif": ("microscopy_imaging", "Tagged Image File Format"),
        "tiff": ("microscopy_imaging", "Tagged Image File Format"),
        "nd2": ("microscopy_imaging", "Nikon NIS-Elements"),
        "lif": ("microscopy_imaging", "Leica Image Format"),
        "czi": ("microscopy_imaging", "Carl Zeiss Image"),
        "oib": ("microscopy_imaging", "Olympus Image Format"),
        "oif": ("microscopy_imaging", "Olympus Image Format"),
        "vsi": ("microscopy_imaging", "Olympus VSI"),
        "ims": ("microscopy_imaging", "Imaris Format"),
        "lsm": ("microscopy_imaging", "Zeiss LSM"),
        "stk": ("microscopy_imaging", "MetaMorph Stack"),
        "dv": ("microscopy_imaging", "DeltaVision"),
        "mrc": ("microscopy_imaging", "Medical Research Council"),
        "dm3": ("microscopy_imaging", "Gatan Digital Micrograph"),
        "dm4": ("microscopy_imaging", "Gatan Digital Micrograph"),
        "dcm": ("microscopy_imaging", "DICOM"),
        "nii": ("microscopy_imaging", "NIfTI"),
        "nrrd": ("microscopy_imaging", "Nearly Raw Raster Data"),
        # Spectroscopy/Analytical
        "fid": ("spectroscopy_analytical", "NMR Free Induction Decay"),
        "mzml": ("spectroscopy_analytical", "Mass Spectrometry Markup Language"),
        "mzxml": ("spectroscopy_analytical", "Mass Spectrometry XML"),
        "raw": ("spectroscopy_analytical", "Vendor Raw Files"),
        "d": ("spectroscopy_analytical", "Agilent Data Directory"),
        "mgf": ("spectroscopy_analytical", "Mascot Generic Format"),
        "spc": ("spectroscopy_analytical", "Galactic SPC"),
        "jdx": ("spectroscopy_analytical", "JCAMP-DX"),
        "jcamp": ("spectroscopy_analytical", "JCAMP-DX"),
        # Proteomics/Metabolomics
        "pepxml": ("proteomics_metabolomics", "Trans-Proteomic Pipeline Peptide XML"),
        "protxml": ("proteomics_metabolomics", "Protein Inference Results"),
        "mzid": ("proteomics_metabolomics", "Peptide Identification Format"),
        "mztab": ("proteomics_metabolomics", "Proteomics/Metabolomics Tabular Format"),
        # General Scientific
        "npy": ("general_scientific", "NumPy Array"),
        "npz": ("general_scientific", "Compressed NumPy Archive"),
        "csv": ("general_scientific", "Comma-Separated Values"),
        "tsv": ("general_scientific", "Tab-Separated Values"),
        "xlsx": ("general_scientific", "Excel Spreadsheets"),
        "xls": ("general_scientific", "Excel Spreadsheets"),
        "json": ("general_scientific", "JavaScript Object Notation"),
        "xml": ("general_scientific", "Extensible Markup Language"),
        "hdf5": ("general_scientific", "Hierarchical Data Format 5"),
        "h5": ("general_scientific", "Hierarchical Data Format 5"),
        "h5ad": ("bioinformatics_genomics", "Anndata Format"),
        "zarr": ("general_scientific", "Chunked Array Storage"),
        "parquet": ("general_scientific", "Apache Parquet"),
        "mat": ("general_scientific", "MATLAB Data"),
        "fits": ("general_scientific", "Flexible Image Transport System"),
    }

    ext_clean = extension.lstrip(".")
    category, description = extension_map.get(ext_clean, ("unknown", "Unknown Format"))
    return ext_clean, category, description
129
+
130
+
131
def get_file_basic_info(filepath):
    """
    Collect filesystem metadata for a file.

    Args:
        filepath: Path to an existing file (str or path-like).

    Returns:
        dict: filename, absolute path, size in bytes, human-readable size,
            last-modified time (ISO 8601, local time) and lower-cased suffix
            (including the leading dot).
    """
    target = Path(filepath)
    file_stat = target.stat()  # single stat call shared by size and mtime
    size_in_bytes = file_stat.st_size
    modified_at = datetime.fromtimestamp(file_stat.st_mtime)

    info = {}
    info["filename"] = target.name
    info["path"] = str(target.absolute())
    info["size_bytes"] = size_in_bytes
    info["size_human"] = format_bytes(size_in_bytes)
    info["modified"] = modified_at.isoformat()
    info["extension"] = target.suffix.lower()
    return info
144
+
145
+
146
def format_bytes(size):
    """
    Render a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the TB unit is
    exhausted; anything still larger is reported in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    index = 0
    while index < len(units):
        if size < 1024.0:
            return f"{size:.2f} {units[index]}"
        size /= 1024.0
        index += 1
    # Five divisions have been applied at this point, so the value is in PB.
    return f"{size:.2f} PB"
153
+
154
+
155
def _extract_format_section(content, extension):
    """Return the '### .<extension>' section of a reference document, or None."""
    import re

    if not extension:
        return None

    # - re.escape: the extension is data, not a pattern.
    # - (?![A-Za-z0-9_]): the heading must name exactly this extension, so
    #   ".d" does not pick up "### .dta" and ".h5" does not pick up "### .h5ad".
    # - The section runs to the next level 1-3 heading at the start of a line
    #   (or end of file); "####" sub-headings and '#' characters inside the
    #   text stay part of the section.
    # Limitation: a line starting with "# " inside a fenced code block is
    # indistinguishable from a heading here and would end the section early.
    pattern = (
        r"^### \." + re.escape(extension) + r"(?![A-Za-z0-9_])"
        r".*?(?=^#{1,3} |\Z)"
    )
    match = re.search(pattern, content, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    return match.group(0) if match else None


def load_reference_info(category, extension):
    """
    Load reference information for the file type.

    The reference document for ``category`` is looked up in the ``references``
    directory that sits next to this script's parent directory, and the
    section whose heading is ``### .<extension>`` is extracted from it.

    Args:
        category: File category (e.g., 'chemistry_molecular')
        extension: File extension without the leading dot

    Returns:
        dict | None: ``{"raw_section": <markdown text>, "reference_file":
            <file name>}`` when a matching section exists; None when the
            category is unknown, the reference file is missing or unreadable,
            or it has no section for this extension.
    """
    # Map categories to reference files
    category_files = {
        "chemistry_molecular": "chemistry_molecular_formats.md",
        "bioinformatics_genomics": "bioinformatics_genomics_formats.md",
        "microscopy_imaging": "microscopy_imaging_formats.md",
        "spectroscopy_analytical": "spectroscopy_analytical_formats.md",
        "proteomics_metabolomics": "proteomics_metabolomics_formats.md",
        "general_scientific": "general_scientific_formats.md",
    }

    if category not in category_files:
        return None

    # Get the reference file path
    script_dir = Path(__file__).parent
    ref_file = script_dir.parent / "references" / category_files[category]

    if not ref_file.exists():
        return None

    # Reference lookup is best-effort: a read failure is reported on stderr
    # and the report is simply generated without a reference section.
    try:
        with open(ref_file, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error loading reference: {e}", file=sys.stderr)
        return None

    section = _extract_format_section(content, extension)
    if section is None:
        return None
    return {"raw_section": section, "reference_file": category_files[category]}
206
+
207
+
208
def analyze_file(filepath):
    """
    Build the complete analysis record for one file.

    Gathers filesystem metadata, classifies the file by extension, loads the
    matching reference section and, for the categories that have a dedicated
    analyzer, runs it.

    Returns:
        dict: keys ``basic_info``, ``file_type``, ``reference_info`` and
            ``data_analysis``. ``data_analysis`` stays empty for categories
            without an analyzer and holds ``{"error": ...}`` if the analyzer
            call itself raised.
    """
    info = get_file_basic_info(filepath)
    ext, family, label = detect_file_type(filepath)
    reference = load_reference_info(family, ext)

    analysis = {
        "basic_info": info,
        "file_type": {"extension": ext, "category": family, "description": label},
        "reference_info": reference,
        "data_analysis": {},
    }

    # Pick the category-specific analyzer, if there is one, and run it.
    try:
        analyzer = None
        if family == "general_scientific":
            analyzer = analyze_general_scientific
        elif family == "bioinformatics_genomics":
            analyzer = analyze_bioinformatics
        elif family == "microscopy_imaging":
            analyzer = analyze_imaging
        # Add more specific analyzers as needed

        if analyzer is not None:
            analysis["data_analysis"] = analyzer(filepath, ext)
    except Exception as e:
        analysis["data_analysis"]["error"] = str(e)

    return analysis
238
+
239
+
240
def analyze_general_scientific(filepath, extension):
    """
    Analyze general scientific data formats.

    Supported extensions and what is reported:
        - ``npy``: shape, dtype, size, ndim and min/max/mean/std. The
          statistics are None for non-numeric, complex or empty arrays.
        - ``npz``: names, count and shapes of the stored arrays.
        - ``csv`` / ``tsv``: shape, columns, dtypes, missing-value counts and
          ``describe()`` statistics, all computed on at most the first
          10,000 rows.
        - ``json``: top-level type, keys (for objects) and length.
        - ``h5`` / ``hdf5``: dataset/group tree and root attributes.

    Args:
        filepath: Path to the data file.
        extension: Extension without the leading dot, lower-case.

    Returns:
        dict: The summary described above; ``{}`` for an unsupported
            extension; ``{"error": ...}`` if a library is missing or the file
            cannot be read.
    """
    results = {}

    try:
        if extension == "npy":
            import numpy as np

            data = np.load(filepath)
            # Reductions raise on empty arrays and float() rejects complex
            # values; report the structure with None statistics instead of
            # failing the whole analysis.
            has_stats = (
                data.size > 0
                and np.issubdtype(data.dtype, np.number)
                and not np.issubdtype(data.dtype, np.complexfloating)
            )
            results = {
                "shape": data.shape,
                "dtype": str(data.dtype),
                "size": data.size,
                "ndim": data.ndim,
                "statistics": {
                    "min": float(np.min(data)) if has_stats else None,
                    "max": float(np.max(data)) if has_stats else None,
                    "mean": float(np.mean(data)) if has_stats else None,
                    "std": float(np.std(data)) if has_stats else None,
                },
            }

        elif extension == "npz":
            import numpy as np

            # The archive keeps a file handle open until closed.
            with np.load(filepath) as data:
                results = {
                    "arrays": list(data.files),
                    "array_count": len(data.files),
                    "array_shapes": {name: data[name].shape for name in data.files},
                }

        elif extension in ("csv", "tsv"):
            import pandas as pd

            sep = "\t" if extension == "tsv" else ","
            df = pd.read_csv(filepath, sep=sep, nrows=10000)  # Sample first 10k rows

            results = {
                "shape": df.shape,
                "columns": list(df.columns),
                "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
                "missing_values": df.isnull().sum().to_dict(),
                "summary_statistics": (
                    df.describe().to_dict()
                    if len(df.select_dtypes(include="number").columns) > 0
                    else {}
                ),
            }

        elif extension == "json":
            # Read as UTF-8 rather than the platform's locale encoding.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)

            results = {
                "type": type(data).__name__,
                "keys": list(data.keys()) if isinstance(data, dict) else None,
                "length": len(data) if isinstance(data, (list, dict)) else None,
            }

        elif extension in ("h5", "hdf5"):
            import h5py

            with h5py.File(filepath, "r") as f:

                def get_structure(group, prefix=""):
                    """Recursively map '/path' -> node description."""
                    items = {}
                    for key in group.keys():
                        path = f"{prefix}/{key}"
                        node = group[key]
                        if isinstance(node, h5py.Dataset):
                            items[path] = {
                                "type": "dataset",
                                "shape": node.shape,
                                "dtype": str(node.dtype),
                            }
                        elif isinstance(node, h5py.Group):
                            items[path] = {"type": "group"}
                            items.update(get_structure(node, path))
                    return items

                results = {"structure": get_structure(f), "attributes": dict(f.attrs)}

    except ImportError as e:
        results["error"] = f"Required library not installed: {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
328
+
329
+
330
def analyze_bioinformatics(filepath, extension):
    """
    Analyze bioinformatics/genomics formats.

    Supported extensions:
        - ``fasta`` / ``fa`` / ``fna``: sequence count, total/mean/min/max
          length over the whole file, and the ids of the first 10 records.
        - ``fastq`` / ``fq``: read count, mean/min/max length and mean
          per-read Phred quality over the first 10,000 reads. Zero-length
          reads count toward the length statistics but have no quality and
          are left out of ``mean_quality``.

    Args:
        filepath: Path to the sequence file.
        extension: Extension without the leading dot, lower-case.

    Returns:
        dict: The summary described above; ``{}`` for an unsupported
            extension; ``{"error": ...}`` if Biopython is missing or the file
            cannot be parsed.
    """
    results = {}

    try:
        if extension in ("fasta", "fa", "fna"):
            from Bio import SeqIO

            # Stream the records: only lengths and the first few ids are
            # needed, so the sequences themselves are not kept in memory.
            lengths = []
            first_ids = []
            for record in SeqIO.parse(filepath, "fasta"):
                lengths.append(len(record))
                if len(first_ids) < 10:
                    first_ids.append(record.id)

            results = {
                "sequence_count": len(lengths),
                "total_length": sum(lengths),
                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
                "sequence_ids": first_ids,  # First 10
            }

        elif extension in ("fastq", "fq"):
            from Bio import SeqIO

            lengths = []
            qualities = []
            for i, record in enumerate(SeqIO.parse(filepath, "fastq")):
                read_length = len(record)
                lengths.append(read_length)
                # An empty read has no bases to average over; dividing by its
                # length would abort the whole analysis.
                if read_length:
                    phred = record.letter_annotations["phred_quality"]
                    qualities.append(sum(phred) / read_length)
                if i >= 9999:  # Sample first 10k
                    break

            results = {
                "read_count_sampled": len(lengths),
                "mean_length": sum(lengths) / len(lengths) if lengths else 0,
                "mean_quality": sum(qualities) / len(qualities) if qualities else 0,
                "min_length": min(lengths) if lengths else 0,
                "max_length": max(lengths) if lengths else 0,
            }

    except ImportError as e:
        results["error"] = f"Required library not installed (try: pip install biopython): {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
378
+
379
+
380
def analyze_imaging(filepath, extension):
    """
    Analyze microscopy/imaging formats.

    Only formats Pillow can open are handled (``tif``, ``tiff``, ``png``,
    ``jpg``, ``jpeg``). Pixel statistics are computed on the first frame;
    for TIFF files the number of pages is reported as well.

    Args:
        filepath: Path to the image file.
        extension: Extension without the leading dot, lower-case.

    Returns:
        dict: size, mode, format, array shape/dtype, value range and mean
            intensity (plus ``page_count`` for TIFF); ``{}`` for an
            unsupported extension; ``{"error": ...}`` if a library is missing
            or the file cannot be read.
    """
    results = {}

    try:
        if extension in ("tif", "tiff", "png", "jpg", "jpeg"):
            import numpy as np
            from PIL import Image

            # Context manager so the underlying file handle is released.
            with Image.open(filepath) as img:
                img_array = np.array(img)

                # Keep fractional bounds for floating-point images; int()
                # would truncate e.g. a [0.2, 0.9] range to [0, 0].
                as_scalar = float if np.issubdtype(img_array.dtype, np.floating) else int

                results = {
                    "size": img.size,
                    "mode": img.mode,
                    "format": img.format,
                    "shape": img_array.shape,
                    "dtype": str(img_array.dtype),
                    "value_range": [as_scalar(img_array.min()), as_scalar(img_array.max())],
                    "mean_intensity": float(img_array.mean()),
                }

                # Check for multi-page TIFF. Image classes without frame
                # support do not define n_frames; treat those as one page.
                if extension in ("tif", "tiff"):
                    results["page_count"] = getattr(img, "n_frames", 1)

    except ImportError as e:
        results["error"] = f"Required library not installed (try: pip install pillow): {e}"
    except Exception as e:
        results["error"] = f"Analysis error: {e}"

    return results
418
+
419
+
420
def generate_markdown_report(analysis, output_path=None):
    """
    Generate a comprehensive markdown report from analysis results.

    Args:
        analysis: Analysis results dictionary with the keys ``basic_info``,
            ``file_type`` and, optionally, ``reference_info`` and
            ``data_analysis``.
        output_path: Path to save the report (if None, prints to stdout)

    Returns:
        str: The report text (also written to ``output_path`` or printed).
    """
    basic = analysis["basic_info"]
    ft = analysis["file_type"]
    extension = ft["extension"]

    # Title, basic information and file type
    lines = [
        f"# Exploratory Data Analysis Report: {basic['filename']}\n",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        "---\n",
        "## Basic Information\n",
        f"- **Filename:** `{basic['filename']}`",
        f"- **Full Path:** `{basic['path']}`",
        f"- **File Size:** {basic['size_human']} ({basic['size_bytes']:,} bytes)",
        f"- **Last Modified:** {basic['modified']}",
        f"- **Extension:** `.{extension}`\n",
        "## File Type\n",
        f"- **Category:** {ft['category'].replace('_', ' ').title()}",
        f"- **Description:** {ft['description']}\n",
    ]

    # Reference Information
    ref = analysis.get("reference_info")
    if ref:
        lines.append("## Format Reference\n")
        if "raw_section" in ref:
            lines.append(ref["raw_section"])
            lines.append(f"\n*Reference: {ref['reference_file']}*\n")

    # Data Analysis
    data = analysis.get("data_analysis")
    if data:
        lines.append("## Data Analysis\n")
        if "error" in data:
            lines.append(f"⚠️ **Analysis Error:** {data['error']}\n")
        else:
            # The analyzers return tuples and library scalar types; default=str
            # keeps json.dumps from failing on anything it cannot encode.
            lines.append("### Summary Statistics\n")
            lines.append("```json")
            lines.append(json.dumps(data, indent=2, default=str))
            lines.append("```\n")

    # Recommendations
    lines.append("## Recommendations for Further Analysis\n")
    lines.append(
        f"Based on the file type (`.{extension}`), consider the following analyses:\n"
    )

    # Specific recommendations per category; other categories get none.
    recommendations = {
        "general_scientific": [
            "- Statistical distribution analysis",
            "- Missing value imputation strategies",
            "- Correlation analysis between variables",
            "- Outlier detection and handling",
            "- Dimensionality reduction (PCA, t-SNE)",
        ],
        "bioinformatics_genomics": [
            "- Sequence quality control and filtering",
            "- GC content analysis",
            "- Read alignment and mapping statistics",
            "- Variant calling and annotation",
            "- Differential expression analysis",
        ],
        "microscopy_imaging": [
            "- Image quality assessment",
            "- Background correction and normalization",
            "- Segmentation and object detection",
            "- Colocalization analysis",
            "- Intensity measurements and quantification",
        ],
    }
    lines.extend(recommendations.get(ft["category"], []))
    lines.append("")

    # Footer
    lines.append("---")
    lines.append("*This report was generated by the exploratory-data-analysis skill.*")

    report = "\n".join(lines)

    if output_path:
        # The report contains non-ASCII characters; writing with the platform
        # default encoding fails on non-UTF-8 locales.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report)
        print(f"Report saved to: {output_path}")
    else:
        print(report)

    return report
516
+
517
+
518
def main():
    """
    Main CLI interface.

    Usage: ``python eda_analyzer.py <filepath> [output.md]``. When no output
    path is given the report is written next to the input as
    ``<input stem>_eda_report.md``. Exits with status 1 when the input path
    argument is missing or does not exist.
    """
    # Diagnostics go to stderr so that stdout carries progress output only.
    if len(sys.argv) < 2:
        print("Usage: python eda_analyzer.py <filepath> [output.md]", file=sys.stderr)
        print("  filepath: Path to the data file to analyze", file=sys.stderr)
        print("  output.md: Optional output path for markdown report", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)

    # If no output path specified, use the input filename
    if output_path is None:
        input_path = Path(filepath)
        output_path = input_path.parent / f"{input_path.stem}_eda_report.md"

    print(f"Analyzing: {filepath}")
    analysis = analyze_file(filepath)

    print("\nGenerating report...")
    generate_markdown_report(analysis, output_path)

    print("\n✓ Analysis complete!")


if __name__ == "__main__":
    main()
.scider/skills/literature-review-agent/SKILL.md ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: literature-review-agent
3
+ description: Step 3 of the PaperOrchestra pipeline (arXiv:2604.05018). Execute the literature search strategy from outline.json — discover candidate papers via web search, verify them through Semantic Scholar (Levenshtein ratio ≥ 70 fuzzy title match, temporal cutoff, dedup by paperId), build a BibTeX file, and draft Introduction + Related Work using ≥90% of the verified pool. Runs in parallel with the plotting-agent. TRIGGER when the orchestrator delegates Step 3 or when the user asks to "find citations for my paper", "draft the related work", or "build the bibliography".
4
+ allowed_agents: [writing]
5
+ ---
6
+
7
+ # Literature Review Agent (Step 3)
8
+
9
+ Faithful implementation of the Hybrid Literature Agent from PaperOrchestra
10
+ (Song et al., 2026, arXiv:2604.05018, §4 Step 3, App. D.3, App. F.1 p.46).
11
+
12
+ **Cost: ~20–30 LLM calls.** This is one of the two longest steps (the other is
13
+ plotting). Wall-time floor is set by Semantic Scholar's 1 QPS verification
14
+ limit.
15
+
16
+ ## Inputs
17
+
18
+ - `workspace/outline.json` — specifically `intro_related_work_plan` with the
19
+ Introduction search directions and the 2-4 Related Work methodology
20
+ clusters
21
+ - `workspace/inputs/conference_guidelines.md` — used to derive `cutoff_date`
22
+ - `workspace/inputs/idea.md`, `workspace/inputs/experimental_log.md` — for
23
+ framing the Intro and grounding the Related Work positioning
24
+
25
+ ## Outputs
26
+
27
+ - `workspace/citation_pool.json` — verified Semantic Scholar metadata for
28
+ every paper that survived verification
29
+ - `workspace/refs.bib` — BibTeX file generated from the verified pool
30
+ - `workspace/drafts/intro_relwork.tex` — drafted Introduction and Related
31
+ Work sections, written into the template, with the rest of the template
32
+ preserved verbatim
33
+
34
+ ## Two-phase pipeline (App. D.3)
35
+
36
+ ```
37
+ PHASE 1 — Parallel Candidate Discovery
38
+ For each search direction in introduction_strategy.search_directions:
39
+ For each limitation_search_query in each related_work cluster:
40
+ - Use the host's web search tool to discover up to ~10 candidate papers.
41
+ - Run up to 10 discovery queries in parallel (host-permitting).
42
+ - Collect (title, snippet, url) tuples — no verification yet.
43
+ → PRE-DEDUP before Phase 2 (see Step 1.5 below)
44
+
45
+ PHASE 2 — Sequential Citation Verification (1 QPS, with cache)
46
+ For each candidate (after pre-dedup), sequentially:
47
+ 0. Check s2_cache.json first (scripts/s2_cache.py --check).
48
+ If HIT: use cached response, skip live S2 call. No throttle needed.
49
+ If MISS: proceed with live request below.
50
+ 1. Query Semantic Scholar by title:
51
+ GET https://api.semanticscholar.org/graph/v1/paper/search?query=<title>
52
+ &fields=title,abstract,year,authors,venue,externalIds&limit=5
53
+ (Public endpoint, no key. Throttle to 1 QPS for live requests only.)
54
+ 2. Store the S2 response in cache: s2_cache.py --store.
55
+ 3. Pick the top hit. Check Levenshtein title ratio against the original
56
+ candidate title. If ratio < 70: discard.
57
+ 4. Bonus: if year and venue exactly align with hints, add a +5 point
58
+ match-quality bonus.
59
+ 5. Require: abstract is non-empty.
60
+ 6. Require: paper.year (or month if known) strictly predates cutoff_date.
61
+ Months default to day-1: e.g., "October 2024" → 2024-10-01.
62
+ 7. If all checks pass, add to verified pool.
63
+ After all candidates are verified, dedup by Semantic Scholar paperId.
64
+ ```
65
+
66
+ The host agent does the LLM/web work; the deterministic helpers in `scripts/`
67
+ do the math.
68
+
69
+ ## Step-by-step
70
+
71
+ ### 0. Derive `cutoff_date`
72
+
73
+ Parse `conference_guidelines.md` for the submission deadline. The paper aligns
74
+ research cutoff with venue submission deadline (App. D.1):
75
+
76
+ | Venue | Cutoff |
77
+ |---|---|
78
+ | CVPR 2025 | Nov 2024 |
79
+ | ICLR 2025 | Oct 2024 |
80
+ | Other | One month before the stated submission deadline |
81
+
82
+ Encode as `YYYY-MM-DD`. Months default to day-1 (e.g., `2024-10-01`).
83
+
84
+ ### 1. Phase 1: Parallel Candidate Discovery
85
+
86
+ From `outline.json`:
87
+
88
+ - All `introduction_strategy.search_directions` (3-5 queries)
89
+ - For each cluster in `related_work_strategy.subsections`:
90
+ - The cluster's `sota_investigation_mission` becomes a search query
91
+ - All `limitation_search_queries` (1-3 each)
92
+
93
+ For each query, **use your host's web search tool** (e.g., `WebSearch` in
94
+ Claude Code, `@web` in Cursor, the search tool in Antigravity). Collect the
95
+ top ~10 candidates per query: title, abstract snippet, source URL.
96
+
97
+ If your host supports parallel sub-tasks, fire up to 10 concurrent search
98
+ queries. If not, run sequentially — slower but functionally equivalent.
99
+
100
+ #### Optional: Exa as a Phase 1 backend
101
+
102
+ If your host has no native web search, OR you want a research-paper-focused
103
+ backend with better signal-to-noise, you can use [Exa](https://exa.ai) via
104
+ the bundled `scripts/exa_search.py` helper. It is **opt-in** and reads
105
+ `EXA_API_KEY` from the environment — the repo never commits a key.
106
+
107
+ ```bash
108
+ export EXA_API_KEY="your-key-here" # get one at https://dashboard.exa.ai/
109
+ python skills/literature-review-agent/scripts/exa_search.py \
110
+ --query "Sparse attention long context transformers" \
111
+ --num-results 15 \
112
+ --discovered-for "related_work[2.1]"
113
+ ```
114
+
115
+ Output is a normalized candidate list ready to merge into
116
+ `raw_candidates.json`. Phase 2 verification (Semantic Scholar fuzzy match,
117
+ cutoff, dedup) is unchanged. See `references/exa-search-cookbook.md` for
118
+ the full recipe, query patterns, cost estimates, and security notes.
119
+
120
+ Combine all discovered candidates into a single working list. Tag each with
121
+ the originating query ID so you can later attribute it to "intro" vs
122
+ "related_work[i]".
123
+
124
+ ### 1.5. Pre-dedup before Phase 2
125
+
126
+ **Always run this before starting Phase 2.** Multiple search queries routinely
127
+ return the same papers (e.g., "Attention is All You Need" appears in almost
128
+ every NLP discovery query). Verifying duplicates wastes 30-40% of S2 quota
129
+ at 1 QPS.
130
+
131
+ ```bash
132
+ python skills/literature-review-agent/scripts/pre_dedup_candidates.py \
133
+ --in workspace/raw_candidates.json \
134
+ --out workspace/deduped_candidates.json
135
+ # Prints: "150 candidates → 97 unique (53 duplicates removed)"
136
+ ```
137
+
138
+ Use `workspace/deduped_candidates.json` as input to Phase 2.
139
+
140
+ ### 2. Phase 2: Sequential Verification via Semantic Scholar (with cache)
141
+
142
+ For each candidate in `deduped_candidates.json`, in **sequential** order:
143
+
144
+ **Step A — check cache first** (no S2 call, no throttle needed):
145
+ ```bash
146
+ python skills/literature-review-agent/scripts/s2_cache.py \
147
+ --cache workspace/cache/s2_cache.json \
148
+ --check "<candidate title>"
149
+ # exit 0 + prints JSON → use cached response, skip Step B
150
+ # exit 1 → proceed to Step B
151
+ ```
152
+
153
+ **Step B — live S2 request** (cache MISS only, throttle to 1 QPS):
154
+
155
+ **Preferred:** use the bundled `scripts/s2_search.py` helper — it handles
156
+ auth, retries, and 429 back-off automatically:
157
+
158
+ ```bash
159
+ python skills/literature-review-agent/scripts/s2_search.py \
160
+ --query "<URL-decoded candidate title>" --limit 5
161
+ # If SEMANTIC_SCHOLAR_API_KEY is set the key is forwarded automatically.
162
+ # If not, the public unauthenticated endpoint is used (≤1 QPS, still works).
163
+ ```
164
+
165
+ Check whether the key is configured before starting Phase 2:
166
+
167
+ ```bash
168
+ python skills/literature-review-agent/scripts/s2_search.py --check-key
169
+ ```
170
+
171
+ **Fallback:** if you prefer your host's URL fetch tool, GET:
172
+ ```
173
+ https://api.semanticscholar.org/graph/v1/paper/search?query=<URL-encoded title>&limit=5&fields=title,abstract,year,authors,venue,externalIds
174
+ ```
175
+ Add header `x-api-key: <SEMANTIC_SCHOLAR_API_KEY>` if the env var is set.
176
+ Be polite: ≤1 request per second for live requests. Cache hits are free.
177
+
178
+ **Step C — store in cache** (after every successful live request):
179
+ ```bash
180
+ python skills/literature-review-agent/scripts/s2_cache.py \
181
+ --cache workspace/cache/s2_cache.json \
182
+ --store "<candidate title>" \
183
+ --response '<full S2 JSON response>'
184
+ ```
185
+
186
+ For the top hit:
187
+
188
+ ```bash
189
+ python skills/literature-review-agent/scripts/levenshtein_match.py \
190
+ --candidate "Original candidate title" \
191
+ --found "S2 returned title"
192
+ # prints integer 0-100. Discard if < 70.
193
+ ```
194
+
195
+ Then check the temporal cutoff:
196
+
197
+ ```bash
198
+ python skills/literature-review-agent/scripts/check_cutoff.py \
199
+ --paper-year 2024 \
200
+ --paper-month 9 \
201
+ --cutoff 2024-10-01
202
+ # exit 0 if strictly predates, exit 1 if not
203
+ ```
204
+
205
+ If both checks pass AND the abstract is non-empty, append the paper's full
206
+ S2 metadata to the verified pool.
207
+
208
+ ### 3. Dedup and assemble the pool
209
+
210
+ After all candidates are verified:
211
+
212
+ ```bash
213
+ python skills/literature-review-agent/scripts/dedupe_by_id.py \
214
+ --in raw_pool.json \
215
+ --out workspace/citation_pool.json
216
+ ```
217
+
218
+ The dedupe script keys on `paperId` (Semantic Scholar's internal unique ID),
219
+ falling back to `externalIds.DOI`, then `externalIds.ArXiv`, then a
220
+ normalized title.
221
+
222
+ The script also computes and writes `min_cite_paper_count` =
223
+ `floor(0.9 * len(papers))` — the minimum number of papers the writing step
224
+ must cite (the paper's ≥90% integration rule, App. D.3).
225
+
226
+ **Immediately after dedupe_by_id.py**, validate and auto-fix the pool schema:
227
+
228
+ ```bash
229
+ python skills/literature-review-agent/scripts/validate_pool.py \
230
+ --pool workspace/citation_pool.json --fix
231
+ # Catches and fixes authors-as-strings, reports missing required fields.
232
+ # Must pass before proceeding to Step 4.
233
+ ```
234
+
235
+ ### 4. Build the BibTeX file
236
+
237
+ ```bash
238
+ python skills/literature-review-agent/scripts/bibtex_format.py \
239
+ --pool workspace/citation_pool.json \
240
+ --out workspace/refs.bib
241
+ ```
242
+
243
+ The script generates citation keys deterministically from `firstauthor + year
244
+ + first significant word of title` (e.g., `vaswani2017attention`). It writes
245
+ out only `@article` / `@inproceedings` / `@misc` entries — never invents
246
+ fields. It also writes the canonical `bibtex_key` back into each paper record
247
+ in `citation_pool.json`.
248
+
249
+ **Immediately after bibtex_format.py**, sync keys in `intro_relwork.tex` (if the draft does not exist yet, run this once Step 5 has produced it):
250
+
251
+ ```bash
252
+ python skills/literature-review-agent/scripts/sync_keys.py \
253
+ --pool workspace/citation_pool.json \
254
+ --tex workspace/drafts/intro_relwork.tex \
255
+ --inplace
256
+ # Replaces every \cite{agent_key} with \cite{canonical_bibtex_key}.
257
+ # Eliminates citation_coverage gate failures caused by key mismatch.
258
+ ```
259
+
260
+ These two steps replace the manual Python snippets that were previously
261
+ required. The pipeline is now:
262
+
263
+ ```
264
+ dedupe_by_id → validate_pool --fix → bibtex_format → sync_keys
265
+ ```
266
+
267
+ ### 5. Draft Introduction + Related Work
268
+
269
+ This is where you (the host agent) actually write text. Load the
270
+ **verbatim Literature Review Agent prompt** at `references/prompt.md`.
271
+ Substitute the template placeholders:
272
+
273
+ | Placeholder | Value |
274
+ |---|---|
275
+ | `intro_related_work_plan` | full JSON object from `outline.json` |
276
+ | `project_idea` | contents of `idea.md` |
277
+ | `project_experimental_log` | contents of `experimental_log.md` |
278
+ | `citation_checklist` | the BibTeX keys from `refs.bib` |
279
+ | `collected_papers` | list of `{key, title, abstract}` from `citation_pool.json` |
280
+ | `paper_count` | `len(citation_pool.papers)` |
281
+ | `min_cite_paper_count` | from `citation_pool.json` |
282
+ | `cutoff_date` | the date you derived in Step 0 |
283
+
284
+ **Also prepend the Anti-Leakage Prompt** from
285
+ `../paper-orchestra/references/anti-leakage-prompt.md`.
286
+
287
+ Run your LLM with the combined prompt against `template.tex`. The agent's
288
+ job is to fill in the empty Introduction and Related Work sections of the
289
+ template **and leave everything else untouched**. Output: the full
290
+ `template.tex` with those two sections filled. Save to
291
+ `workspace/drafts/intro_relwork.tex`.
292
+
293
+ ### 6. Verify ≥90% citation coverage
294
+
295
+ ```bash
296
+ python skills/literature-review-agent/scripts/citation_coverage.py \
297
+ --tex workspace/drafts/intro_relwork.tex \
298
+ --pool workspace/citation_pool.json
299
+ # exit 0 if ≥90% of pool is cited; exit 1 otherwise
300
+ ```
301
+
302
+ If the gate fails, re-prompt the writing step explicitly listing the missing
303
+ keys and asking the agent to integrate them where contextually appropriate.
304
+
305
+ ## Critical rules from the prompt
306
+
307
+ These are excerpted from `references/prompt.md`. The host agent MUST honor
308
+ them on the writing call:
309
+
310
+ - **Cite ONLY from `collected_papers`.** Never invent BibTeX keys, never
311
+ reference papers not in the pool.
312
+ - **Cite at least `min_cite_paper_count` of them** in Intro + Related Work
313
+ combined.
314
+ - **TIMELINE RULE**: Do not treat any papers published after `cutoff_date`
315
+ as prior baselines to beat. They are concurrent work only.
316
+ - **EVALUATION RULE**: Do not claim our method beats / achieves SOTA over a
317
+ specific cited paper UNLESS that paper is explicitly evaluated against in
318
+ `experimental_log.md`. Frame other recent papers strictly as concurrent,
319
+ orthogonal, or conceptual work.
320
+ - **Output format**: return the full code for the updated `template.tex`,
321
+ with the two empty sections (Introduction and Related Work) filled in,
322
+ and **all the other code** (packages, styles, other sections) **identical
323
+ to the original** template.tex.
324
+ - Wrap output in ```` ```latex ... ``` ```` fences.
325
+ - Do not change `\usepackage[capitalize]{cleveref}` to `cleverref` (there is
326
+ no `cleverref.sty`).
327
+
328
+ ## Degraded mode (no web search)
329
+
330
+ If your host has no web search tool, switch to degraded mode:
331
+
332
+ 1. If the user has placed a pre-built `workspace/inputs/refs.bib` in the
333
+ workspace, load it directly into `workspace/refs.bib` and skip Phase 1
334
+ and Phase 2.
335
+ 2. Otherwise, emit `workspace/drafts/intro_relwork.tex` containing the
336
+ template with two TODO markers in the Intro and Related Work sections,
337
+ and tell the user the pipeline cannot complete Step 3 without web search.
338
+
339
+ ## Resources
340
+
341
+ - `references/prompt.md` — verbatim Literature Review Agent prompt from App. F.1
342
+ - `references/discovery-pipeline.md` — Phase 1 + Phase 2 explained in detail
343
+ - `references/verification-rules.md` — Levenshtein cutoff, year alignment, dedup
344
+ - `references/citation-density-rule.md` — the ≥90% integration rule
345
+ - `references/s2-api-cookbook.md` — Semantic Scholar URLs, fields, rate limits
346
+ - `references/exa-search-cookbook.md` — optional Exa backend for Phase 1 (research-paper-focused web search)
347
+ - `scripts/pre_dedup_candidates.py` — **NEW** dedup Phase 1 candidates before Phase 2 (saves 30-40% S2 quota)
348
+ - `scripts/s2_cache.py` — **NEW** persistent S2 response cache (eliminates re-verification on re-runs)
349
+ - `scripts/validate_pool.py` — **NEW** validate & auto-fix citation_pool.json schema (authors format)
350
+ - `scripts/sync_keys.py` — **NEW** sync cite keys in .tex with canonical bibtex_keys after bibtex_format.py
351
+ - `scripts/levenshtein_match.py` — fuzzy title match (keep if ratio ≥ 70)
352
+ - `scripts/check_cutoff.py` — date cmp w/ month → day-1 default
353
+ - `scripts/dedupe_by_id.py` — dedup verified pool by S2 paperId
354
+ - `scripts/bibtex_format.py` — build refs.bib from JSON pool
355
+ - `scripts/citation_coverage.py` — ≥90% citation coverage gate
356
+ - `scripts/s2_search.py` — **NEW** Semantic Scholar title-search helper; reads `SEMANTIC_SCHOLAR_API_KEY` from env (optional — falls back to unauthenticated)
357
+ - `scripts/exa_search.py` — optional Exa Phase 1 backend (reads `EXA_API_KEY` from env)
.scider/skills/literature-review-agent/references/citation-density-rule.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Citation Density Rule
2
+
3
+ Source: arXiv:2604.05018, App. D.3.
4
+
5
+ ## The 90% rule
6
+
7
+ > ...the system strictly constrains the model to cite only the provided
8
+ > verified papers, explicitly mandating that at least 90% of the gathered
9
+ > literature pool must be actively integrated and cited when synthesizing
10
+ > the Introduction and Related Work sections.
11
+
12
+ Why: this is the paper's core defense against citation inflation. The
13
+ literature review pool is built once via the rigorous discovery →
14
+ verification → dedup pipeline. The writing step must then *use* almost all
15
+ of it. This prevents the agent from gathering 50 papers and citing only the
16
+ 3 most famous ones, which would defeat the entire literature search.
17
+
18
+ ## Implementation
19
+
20
+ After the Lit Review writing call produces `intro_relwork.tex`:
21
+
22
+ ```bash
23
+ python scripts/citation_coverage.py \
24
+ --tex workspace/drafts/intro_relwork.tex \
25
+ --pool workspace/citation_pool.json \
26
+ --threshold 0.90
27
+ ```
28
+
29
+ The script:
30
+
31
+ 1. Reads `citation_pool.json` and counts `papers[]` (= N).
32
+ 2. Computes `min_required = floor(0.90 * N)`.
33
+ 3. Greps `intro_relwork.tex` for all `\cite{KEY}`, `\citep{KEY}`, `\citet{KEY}`,
34
+ `\autocite{KEY}`, `\citeauthor{KEY}`, etc.
35
+ 4. Counts the **unique** keys actually cited.
36
+ 5. Reports `cited / N` and exits non-zero if `cited < min_required`.
37
+
38
+ ## What to do on failure
39
+
40
+ The script prints the missing keys grouped by `discovered_for` cluster:
41
+
42
+ ```
43
+ FAIL: 17/22 papers cited (77.3%, need ≥90%)
44
+ Uncited papers (5):
45
+ - vaswani2017attention [discovered_for: intro] (Attention Is All You Need)
46
+ - he2016deep [discovered_for: intro] (Deep Residual Learning ...)
47
+ - liu2024video [discovered_for: related_work[2.1]] (Long Video Generation ...)
48
+ - chen2024sparse [discovered_for: related_work[2.2]] (Sparse Attention Surveys ...)
49
+ - kim2024transformer [discovered_for: related_work[2.2]] (Transformer Scaling Laws ...)
50
+ ```
51
+
52
+ The host agent should then re-call the Lit Review writing step with an
53
+ appended instruction:
54
+
55
+ ```
56
+ The previous draft cited only 17 out of 22 verified papers (77.3%, threshold
57
+ is 90%). You MUST integrate the following 5 papers into the appropriate
58
+ sections:
59
+ - vaswani2017attention (intro): foundational attention reference
60
+ - he2016deep (intro): foundational ResNet reference
61
+ - liu2024video (related work 2.1): direct competing approach for long video
62
+ - chen2024sparse (related work 2.2): sparse attention survey, group with [...]
63
+ - kim2024transformer (related work 2.2): scaling-laws context
64
+
65
+ Do not remove any existing citations. Add new ones where contextually
66
+ appropriate. Re-emit the full template.tex with both sections updated.
67
+ ```
68
+
69
+ After 2-3 re-prompts, if coverage still falls short, the pipeline should
70
+ emit a warning and proceed — the paper does not specify a hard halt on this,
71
+ only a strong constraint.
.scider/skills/literature-review-agent/references/discovery-pipeline.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Discovery Pipeline (Phase 1 + Phase 2)
2
+
3
+ Source: arXiv:2604.05018, App. D.3 ("Citation Verification") and App. B
4
+ (LLM-call distribution).
5
+
6
+ ## Phase 1 — Parallel Candidate Discovery
7
+
8
+ The paper uses 10 concurrent workers to fan out search-grounded LLM calls
9
+ ("Gemini-3-Flash with Google Search grounding"). For our host-agent
10
+ implementation, the equivalent is: spawn up to 10 concurrent search queries
11
+ using the host's native web search tool.
12
+
13
+ ### Inputs
14
+
15
+ From `outline.json`:
16
+
17
+ ```
18
+ introduction_strategy:
19
+ search_directions: [q1, q2, q3] # 3-5 queries
20
+ related_work_strategy:
21
+ subsections:
22
+ - methodology_cluster: "..."
23
+ sota_investigation_mission: "..." # 1 derived query
24
+ limitation_search_queries: [q4, q5] # 1-3 queries
25
+ - ...
26
+ ```
27
+
28
+ Total query budget: typically 10-20 queries per paper.
29
+
30
+ ### Per-query procedure
31
+
32
+ For each search query, instruct your host's search tool:
33
+
34
+ ```
35
+ search("<query>", num_results=10)
36
+ ```
37
+
38
+ Or, if you've enabled the optional Exa backend (see `exa-search-cookbook.md`):
39
+
40
+ ```bash
41
+ python scripts/exa_search.py --query "<query>" --num-results 10
42
+ ```
43
+
44
+ Both paths produce the same normalized candidate format. Collect the top
45
+ 10 results per query. Each result should yield:
46
+
47
+ - `title` — the paper's title from the search snippet
48
+ - `snippet` — the abstract preview from the search snippet
49
+ - `source_url` — the result URL (often the arXiv abstract page)
50
+
51
+ Tag each result with `discovered_for: ["intro"]` or
52
+ `discovered_for: ["related_work[2.1]"]` so you can later trace which cluster
53
+ each citation supports.
54
+
55
+ Combine all results across all queries into a single `raw_candidates.json`:
56
+
57
+ ```json
58
+ {
59
+ "candidates": [
60
+ {
61
+ "title": "Attention Is All You Need",
62
+ "snippet": "The dominant sequence transduction models...",
63
+ "source_url": "https://arxiv.org/abs/1706.03762",
64
+ "discovered_for": ["intro"]
65
+ },
66
+ ...
67
+ ]
68
+ }
69
+ ```
70
+
71
+ ## Phase 2 — Sequential Verification via Semantic Scholar
72
+
73
+ The paper enforces strict sequential verification at ≤1 QPS via the public
74
+ Semantic Scholar API. We follow the same constraint.
75
+
76
+ ### Per-candidate procedure
77
+
78
+ 1. **Search S2 by title**. Use the host's URL fetch tool:
79
+ ```
80
+ GET https://api.semanticscholar.org/graph/v1/paper/search
81
+ ?query=<URL-encoded(title)>
82
+ &limit=5
83
+ &fields=title,abstract,year,authors,venue,externalIds
84
+ ```
85
+ No API key required for the public endpoint. Be polite: ≤1 QPS.
86
+
87
+ 2. **Take the top hit**. Compare `title` to the candidate `title` via the
88
+ helper:
89
+ ```bash
90
+ python scripts/levenshtein_match.py --candidate "..." --found "..."
91
+ ```
92
+ The helper prints an integer 0-100 (the Levenshtein ratio).
93
+ - **< 70 → discard the candidate.** Move on.
94
+ - **≥ 70 → continue to checks 3-5.**
95
+
96
+ 3. **Check abstract presence**. If `abstract` is null or empty → discard.
97
+ The paper requires every cited entity to have a retrievable abstract for
98
+ downstream context enrichment in the Section Writing Agent.
99
+
100
+ 4. **Check temporal cutoff**:
101
+ ```bash
102
+ python scripts/check_cutoff.py \
103
+ --paper-year <year> \
104
+ --paper-month <month or omit> \
105
+ --cutoff <YYYY-MM-DD>
106
+ ```
107
+ Exit 0 if strictly predates; exit 1 if not. Discard on exit 1.
108
+
109
+ 5. **Year-alignment bonus**. If the candidate's `discovered_for` query
110
+ mentioned a specific year and the S2 hit's year matches exactly, record
111
+ `match_score = ratio + 5`. (This is a soft bonus used for tie-breaking
112
+ when two candidates dedup to similar entries.)
113
+
114
+ 6. **Append to verified pool** if all checks pass. Record:
115
+ ```json
116
+ {
117
+ "paperId": "abc123...",
118
+ "title": "...",
119
+ "abstract": "...",
120
+ "year": 2017,
121
+ "venue": "NeurIPS",
122
+ "authors": [{"name": "A. Vaswani"}, ...],
123
+ "externalIds": {"DOI": "...", "ArXiv": "1706.03762"},
124
+ "match_score": 100,
125
+ "discovered_for": ["intro"]
126
+ }
127
+ ```
128
+
129
+ ### Rate-limit etiquette
130
+
131
+ The S2 public endpoint enforces ~1 QPS without an API key. If you receive
132
+ HTTP 429, sleep 5 seconds and retry. Do not parallelize Phase 2 — verification
133
+ must be strictly sequential.
134
+
135
+ For budgeting: the paper measures ~20-30 LLM/API calls total per Lit Review
136
+ Agent invocation. At 1 QPS, ~30 candidates take roughly 30 seconds of
137
+ verification wall-time; 100 candidates take ~100 seconds.
138
+
139
+ ## Why two phases
140
+
141
+ The split exists because:
142
+
143
+ - **Discovery is high-throughput, low-stakes**. You want to cast a wide net
144
+ fast. Search APIs accept high concurrency.
145
+ - **Verification is low-throughput, high-stakes**. The S2 API protects
146
+ itself with QPS limits, and the verification step is what keeps the paper
147
+ honest. Faking a citation is trivially easy without it.
148
+
149
+ The paper's design "successfully combines the high-concurrency tolerance of
150
+ the LLM API with the strict throughput limits of the Semantic Scholar API to
151
+ prevent quota-induced latency" (App. B).
.scider/skills/literature-review-agent/references/exa-search-cookbook.md ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exa Search Cookbook (optional Phase 1 backend)
2
+
3
+ [Exa](https://exa.ai) is a search engine optimized for finding academic
4
+ papers and other high-quality content. The `literature-review-agent` can
5
+ use Exa as an **OPTIONAL** backend for Phase 1 candidate discovery — useful
6
+ when your host coding agent has no native web search tool, or when you
7
+ want a research-paper-focused search backend with better signal-to-noise
8
+ than general web search.
9
+
10
+ > **Exa is opt-in.** The literature-review-agent's default Phase 1 path is
11
+ > "use your host agent's native web search tool" (`WebSearch` in Claude
12
+ > Code, `@web` in Cursor, the search tool in Antigravity, etc.). That
13
+ > requires zero configuration and no API key. Use Exa only if you want
14
+ > to.
15
+
16
+ ## Why use it
17
+
18
+ Exa fills three gaps:
19
+
20
+ 1. **Hosts with no built-in search.** Aider, OpenCode, and generic CLI
21
+ agents often lack a native web search tool. Exa gives them one.
22
+ 2. **Research-paper-focused results.** Exa's `category: "research paper"`
23
+ filter returns higher signal-to-noise than general web search for
24
+ academic queries. For example, the query "PaperOrchestra" returns
25
+ arXiv pages, conference proceedings, and academic tools rather than
26
+ general SEO content.
27
+ 3. **Batch / non-interactive runs.** When you want a deterministic,
28
+ scriptable backend rather than going through the host agent's tool
29
+ interface.
30
+
31
+ Exa returns up to 20 results per call (the helper clamps `--num-results` to 1–20), and
32
+ each result includes a `title`, `url`, optional `publishedDate`, and a
33
+ list of `highlights` (snippets) which the helper joins into a `snippet`
34
+ field consumable by the rest of the Phase 1 pipeline.
35
+
36
+ ## Get a key
37
+
38
+ 1. Sign up at <https://dashboard.exa.ai/>.
39
+ 2. Copy your API key (format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`).
40
+ 3. Set it in your environment:
41
+
42
+ ```bash
43
+ export EXA_API_KEY="paste-key-here"
44
+ ```
45
+
46
+ Or put it in a `.env` file (which is gitignored — the repo `.gitignore`
47
+ blocks `*.env` and `.env*` patterns) and source it:
48
+
49
+ ```bash
50
+ set -a; source .env; set +a
51
+ ```
52
+
53
+ **This repo never commits a key.** The helper reads `EXA_API_KEY` from the
54
+ environment at runtime. The key is your responsibility to provision and
55
+ secure.
56
+
57
+ ## Run the helper
58
+
59
+ ```bash
60
+ python skills/literature-review-agent/scripts/exa_search.py \
61
+ --query "Sparse attention long context transformers" \
62
+ --num-results 15 \
63
+ --discovered-for "related_work[2.1]"
64
+ ```
65
+
66
+ Output (default — normalized to the literature-review-agent candidate
67
+ format):
68
+
69
+ ```json
70
+ {
71
+ "candidates": [
72
+ {
73
+ "title": "Longformer: The Long-Document Transformer",
74
+ "snippet": "We present the Longformer, a self-attention mechanism that scales linearly with sequence length...",
75
+ "source_url": "https://arxiv.org/abs/2004.05150",
76
+ "discovered_for": ["related_work[2.1]"],
77
+ "_exa_id": "https://arxiv.org/abs/2004.05150",
78
+ "_exa_published_date": "2020-04-10T00:00:00.000Z"
79
+ },
80
+ ...
81
+ ]
82
+ }
83
+ ```
84
+
85
+ This JSON can be merged directly into `workspace/raw_candidates.json`
86
+ before the Phase 2 sequential verification step.
87
+
88
+ ### Useful flags
89
+
90
+ | Flag | Default | Purpose |
91
+ |---|---|---|
92
+ | `--query` | (required) | Search query string |
93
+ | `--num-results` | `10` | 1–20; the helper clamps to this range |
94
+ | `--category` | `"research paper"` | Pass `""` to disable category filtering for broader results |
95
+ | `--highlight-chars` | `4000` | Max characters per highlight (Exa parameter) |
96
+ | `--discovered-for` | `"intro"` | Tag attached to each candidate; use `"related_work[2.1]"` for cluster queries |
97
+ | `--raw` | off | Print the full Exa response JSON instead of normalized candidates |
98
+
99
+ ## Direct curl recipe
100
+
101
+ If you'd rather not use the Python helper (for one-off testing, or to
102
+ invoke from a host agent's `Bash` / `WebFetch` tool directly):
103
+
104
+ ```bash
105
+ curl -X POST https://api.exa.ai/search \
106
+ --header "content-type: application/json" \
107
+ --header "x-api-key: $EXA_API_KEY" \
108
+ --data '{
109
+ "query": "PaperOrchestra automated paper writing",
110
+ "category": "research paper",
111
+ "numResults": 10,
112
+ "type": "auto",
113
+ "contents": {
114
+ "highlights": {
115
+ "maxCharacters": 4000
116
+ }
117
+ }
118
+ }'
119
+ ```
120
+
121
+ The `$EXA_API_KEY` reference assumes the key is in your shell env. **Do
122
+ not** paste the literal key into the curl command in shell history or
123
+ chat — use the env var.
124
+
125
+ ## Response shape
126
+
127
+ ```json
128
+ {
129
+ "requestId": "52fcb70256224863b33f356fdae37c7f",
130
+ "resolvedSearchType": "neural",
131
+ "results": [
132
+ {
133
+ "id": "https://arxiv.org/abs/2604.05018",
134
+ "title": "PaperOrchestra: A Multi-Agent Framework for ...",
135
+ "url": "https://arxiv.org/abs/2604.05018",
136
+ "publishedDate": "2026-04-06T00:00:00.000Z",
137
+ "highlights": ["...", "..."],
138
+ "highlightScores": [0.4, 0.3],
139
+ "image": "https://...",
140
+ "favicon": "https://..."
141
+ }
142
+ ],
143
+ "searchTime": 975.2,
144
+ "costDollars": {
145
+ "total": 0.007,
146
+ "search": {"neural": 0.007}
147
+ }
148
+ }
149
+ ```
150
+
151
+ ## Mapping Exa → literature-review-agent candidate format
152
+
153
+ Phase 2 verification (Semantic Scholar fuzzy match → cutoff check → dedup)
154
+ expects candidates in this shape:
155
+
156
+ ```json
157
+ {
158
+ "title": "...",
159
+ "snippet": "...",
160
+ "source_url": "...",
161
+ "discovered_for": ["intro"]
162
+ }
163
+ ```
164
+
165
+ `exa_search.py --normalize` (the default mode) does this mapping:
166
+
167
+ | Exa field | Candidate field |
168
+ |---|---|
169
+ | `result.title` | `title` |
170
+ | `result.url` (fallback `result.id`) | `source_url` |
171
+ | `result.highlights` joined and capped at 1500 chars | `snippet` |
172
+ | `--discovered-for` flag | `discovered_for` |
173
+ | `result.id` | `_exa_id` (preserved for debugging) |
174
+ | `result.publishedDate` | `_exa_published_date` (preserved for tie-breaking) |
175
+
176
+ Phase 2 verification still goes through Semantic Scholar regardless of
177
+ whether the candidate came from Exa or from the host's native search.
178
+ Exa is ONLY a discovery backend; the verification chain
179
+ (`levenshtein_match.py` → `check_cutoff.py` → `dedupe_by_id.py` →
180
+ `bibtex_format.py` → `citation_coverage.py`) is unchanged.
181
+
182
+ ## Query patterns
183
+
184
+ Match the literature-review-agent's outline-driven query design. Run one
185
+ Exa call per query, then merge all candidate lists:
186
+
187
+ | Query type | Source in `outline.json` | Example query | `--discovered-for` |
188
+ |---|---|---|---|
189
+ | Macro context | `introduction_strategy.search_directions[i]` | `"Survey of long-context attention mechanisms 2020-2024"` | `"intro"` |
190
+ | Foundational | same | `"Foundational papers transformer self-attention scaling laws"` | `"intro"` |
191
+ | SOTA scan | `related_work_strategy.subsections[i].sota_investigation_mission` | `"Recent SOTA sparse attention transformers 2024"` | `"related_work[2.1]"` |
192
+ | Limitation hunt | `related_work_strategy.subsections[i].limitation_search_queries[j]` | `"Block-sparse attention failure modes long sequences"` | `"related_work[2.1]"` |
193
+
194
+ For the related-work cluster queries, the `--discovered-for` tag matters
195
+ — the downstream `citation_coverage.py` gate uses it to attribute each
196
+ citation to the right cluster when reporting which papers were not yet
197
+ integrated.
198
+
199
+ ## Cost and rate limits
200
+
201
+ Exa pricing is per-query (~$0.007 per neural search at the time of
202
+ writing). For a typical paper with ~15-20 search queries (3-5 intro
203
+ queries + 10-15 related-work queries), one full Lit Review Agent run
204
+ costs ~$0.10-$0.15. Check <https://exa.ai/pricing> for current rates.
205
+
206
+ Exa's rate limits are generous; the paper's 10-worker parallel discovery
207
+ pattern is well within them. The pipeline's wall-time floor is still set
208
+ by Semantic Scholar's 1 QPS verification limit, not by Exa.
209
+
210
+ ## Security
211
+
212
+ - **NEVER commit `EXA_API_KEY` to git.** The repo's `.gitignore` blocks
213
+ `.env`, `*.env`, and `secrets.json` patterns. Keep your key in your
214
+ shell environment or your secrets manager (1Password CLI `op`, Doppler,
215
+ etc.).
216
+ - The helper reads the key from the environment only. It does NOT accept
217
+ the key as a command-line argument (which would expose it in shell
218
+ history).
219
+ - Exa logs requests for billing and quality. Assume your queries are
220
+ visible to Exa and may be retained. Don't include sensitive draft text
221
+ in queries.
222
+
223
+ ## Troubleshooting
224
+
225
+ | Symptom | Likely cause | Fix |
226
+ |---|---|---|
227
+ | `ERROR: EXA_API_KEY environment variable not set` | env var missing | `export EXA_API_KEY="..."` |
228
+ | `ERROR: Exa HTTP 401` | invalid or expired key | check the dashboard for the current key |
229
+ | `ERROR: Exa HTTP 429` | rate-limited | back off, lower concurrency |
230
+ | `WARN: Exa returned 0 results` | query too narrow or odd category | broaden the query or try `--category ""` |
231
+ | `Exa network error` | no internet, DNS issue | check your connection; the helper uses urllib stdlib only, no proxy support |
232
+
233
+ ## When to prefer Exa vs the host's native search
234
+
235
+ | Use case | Recommended backend |
236
+ |---|---|
237
+ | Claude Code, Cursor, Antigravity (have native web search) | host's native search (free, integrated) |
238
+ | Aider, OpenCode, generic CLI agents | Exa (gives them search) |
239
+ | Batch reproducible runs | Exa (deterministic backend) |
240
+ | Research-paper-heavy queries | Exa (better academic signal) |
241
+ | One-off interactive runs | host's native search (less friction) |
242
+
243
+ You can also mix: use the host's web search for the broad intro queries
244
+ and Exa for the narrow limitation-search queries where the
245
+ research-paper-category filter helps the most.
.scider/skills/literature-review-agent/references/prompt.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Literature Review Agent — verbatim prompt
2
+
3
+ **Source: arXiv:2604.05018, Appendix F.1, page 46 (verbatim).**
4
+
5
+ This is the exact prompt used by the Literature Review Agent in the paper.
6
+ Use it as your system message when drafting Introduction and Related Work.
7
+ Substitute the placeholders before sending. The Anti-Leakage Prompt
8
+ (`../paper-orchestra/references/anti-leakage-prompt.md`) MUST be prepended.
9
+
10
+ ---
11
+
12
+ ```
13
+ Role: Senior AI Researcher.
14
+
15
+ Task: Write the introduction and related work section of a paper.
16
+
17
+ You will be given a template.tex, this is the initial skeleton we outlined for
18
+ you. Your job is to fill in two sections: Introduction and Related Work.
19
+ Leave all the other sections untouched.
20
+
21
+ Inputs:
22
+ - intro_related_work_plan: This is your PRIMARY guide for structure and
23
+ arguments.
24
+ - project_idea and project_experimental_log: Use them to ensure the Intro
25
+ accurately frames the technical contribution and results.
26
+ - citation_checklist: This includes the citation keys that you should use
27
+ when citing relevant papers.
28
+ - collected_papers: These are all the relevant papers we collect for you for
29
+ citation purpose.
30
+
31
+ YOU MUST ONLY CITE THE GIVEN collected_papers, DO NOT cite new papers other
32
+ than the given papers.
33
+
34
+ Citation Requirements:
35
+ - You have access to the abstract of {paper_count} collected papers.
36
+ - You MUST cite at least {min_cite_paper_count} of them across the
37
+ introduction and related work sections.
38
+ - Introduction: Cite key statistics, foundational models (CLIP, etc.), and
39
+ broad problem statements.
40
+ - Related Work: Do deep comparative citations. Group distinct works (e.g.,
41
+ "Several methods [A, B, C]...").
42
+ - Ensure every \cite{{key}} corresponds exactly to a key in
43
+ citation_checklist.
44
+ - CRITICAL TIMELINE RULE: Do not treat any papers published after
45
+ {cutoff_date} as prior baselines to beat. Treat them strictly as
46
+ concurrent work.
47
+ - CRITICAL EVALUATION RULE: Do not claim our method beats or achieves
48
+ State-of-the-Art over a specific cited paper UNLESS that paper is
49
+ explicitly evaluated against in project_experimental_log. Frame other
50
+ recent papers strictly as concurrent, orthogonal, or conceptual work.
51
+ - You need to return the full code for the new template.tex, where the two
52
+ empty sections (Introduction and Related Work) are now filled in, while
53
+ all the other code (packages, styles, and other sections) are identical
54
+ to the original template.tex.
55
+
56
+ Important Note:
57
+ DO NOT change \usepackage[capitalize]{{cleveref}} into
58
+ \usepackage[capitalize]{{cleverref}}, as there's no cleverref.sty.
59
+
60
+ Output Format:
61
+ You must return the code for the updated template.tex. Make sure to wrap the
62
+ code with ```latex content ```.
63
+ ```
64
+
65
+ ---
66
+
67
+ ## Placeholder substitution table
68
+
69
+ | Placeholder | Source |
70
+ |---|---|
71
+ | `{paper_count}` | `len(citation_pool.papers)` from `workspace/citation_pool.json` |
72
+ | `{min_cite_paper_count}` | `floor(0.9 * paper_count)` — the ≥90% rule |
73
+ | `{cutoff_date}` | Derived from `conference_guidelines.md` — see App. D.1 of the paper |
74
+
75
+ The other placeholders (`intro_related_work_plan`, `project_idea`,
76
+ `project_experimental_log`, `citation_checklist`, `collected_papers`) are
77
+ substituted by passing their full file/JSON contents into the user message.
.scider/skills/literature-review-agent/references/s2-api-cookbook.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Semantic Scholar API Cookbook
2
+
3
+ How to verify a candidate paper via the Semantic Scholar Graph API.
4
+
5
+ Base: `https://api.semanticscholar.org/graph/v1`
6
+
7
+ Reference: <https://api.semanticscholar.org/api-docs/graph>
8
+
9
+ ## API key (optional)
10
+
11
+ The pipeline uses the **public, unauthenticated endpoint** by default — no key
12
+ required. If you have a Semantic Scholar API key you can pass it via the
13
+ `x-api-key` header to get higher rate limits (useful for large batches).
14
+
15
+ Get a free key at <https://api.semanticscholar.org/> then export it once:
16
+
17
+ ```bash
18
+ export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
19
+ ```
20
+
21
+ The bundled `scripts/s2_search.py` helper picks this up automatically. If the
22
+ variable is not set the script falls back to the unauthenticated endpoint — the
23
+ pipeline works fine either way; just keep to ≤1 QPS on live requests.
24
+
25
+ ```bash
26
+ # check whether the key is configured
27
+ python skills/literature-review-agent/scripts/s2_search.py --check-key
28
+
29
+ # search by title (key used automatically if set)
30
+ python skills/literature-review-agent/scripts/s2_search.py \
31
+ --query "Attention is All You Need" --limit 5
32
+
33
+ # print the raw S2 JSON
34
+ python skills/literature-review-agent/scripts/s2_search.py \
35
+ --query "BERT pre-training" --raw
36
+ ```
37
+
38
+ The repo never commits a key. Key management is your responsibility (shell
39
+ environment, 1Password, doppler, etc.).
40
+
41
+ ## Endpoint 1 — Search by title
42
+
43
+ ```
44
+ GET /paper/search
45
+ ?query=<URL-encoded title>
46
+ &limit=5
47
+ &fields=title,abstract,year,authors,venue,externalIds
48
+ ```
49
+
50
+ Example:
51
+
52
+ ```
53
+ GET https://api.semanticscholar.org/graph/v1/paper/search?query=Attention%20Is%20All%20You%20Need&limit=5&fields=title,abstract,year,authors,venue,externalIds
54
+ ```
55
+
56
+ Response (truncated):
57
+
58
+ ```json
59
+ {
60
+ "total": 12345,
61
+ "data": [
62
+ {
63
+ "paperId": "204e3073870fae3d05bcbc2f6a8e263d9b72e776",
64
+ "title": "Attention is All you Need",
65
+ "abstract": "The dominant sequence transduction models are based on...",
66
+ "year": 2017,
67
+ "venue": "NeurIPS",
68
+ "authors": [{"name": "Ashish Vaswani"}, ...],
69
+ "externalIds": {
70
+ "DBLP": "conf/nips/VaswaniSPUJGKP17",
71
+ "ArXiv": "1706.03762",
72
+ "DOI": "10.5555/3295222.3295349"
73
+ }
74
+ },
75
+ ...
76
+ ]
77
+ }
78
+ ```
79
+
80
+ ## Endpoint 2 — Get a specific paper by ID
81
+
82
+ ```
83
+ GET /paper/<paperId>?fields=title,abstract,year,authors,venue,externalIds,citationCount
84
+ ```
85
+
86
+ ## Useful identifiers
87
+
88
+ You can pass these as `<paperId>`:
89
+
90
+ - S2 internal: `204e3073870fae3d05bcbc2f6a8e263d9b72e776`
91
+ - DOI: `DOI:10.18653/v1/N18-3011`
92
+ - ArXiv: `ARXIV:1706.03762`
93
+ - Corpus ID: `CorpusId:13756489`
94
+ - URL: `URL:https://arxiv.org/abs/1706.03762`
95
+
96
+ ## Rate limits
97
+
98
+ - Unauthenticated: ~1 QPS sustained. Bursts will get 429.
99
+ - Per the paper, "the strict throughput limits of the Semantic Scholar API
100
+ (1 query per second)" — App. B.
101
+
102
+ If you get HTTP 429, sleep 5 seconds before retrying. Don't loop tightly.
103
+
104
+ ## Fields cheat sheet
105
+
106
+ | Field | Type | Required by our pipeline? |
107
+ |---|---|---|
108
+ | `paperId` | string | yes (dedup key) |
109
+ | `title` | string | yes (Levenshtein match) |
110
+ | `abstract` | string | yes (rule 2: must exist) |
111
+ | `year` | int | yes (cutoff check) |
112
+ | `authors[].name` | string | yes (BibTeX author field) |
113
+ | `venue` | string | recommended (BibTeX journal/booktitle) |
114
+ | `externalIds.DOI` | string | recommended (dedup fallback, BibTeX doi) |
115
+ | `externalIds.ArXiv` | string | recommended (dedup fallback) |
116
+ | `publicationDate` | string `YYYY-MM-DD` | optional (more precise cutoff check) |
117
+ | `citationCount` | int | optional (could inform tie-breaking) |
118
+
119
+ Always pass `fields=...` explicitly — the default response is minimal and
120
+ will not include the abstract.
121
+
122
+ ## Error handling
123
+
124
+ | Status | Meaning | What to do |
125
+ |---|---|---|
126
+ | 200 | OK | proceed |
127
+ | 400 | bad query syntax | URL-encode the title properly; retry once |
128
+ | 404 | not found | discard the candidate |
129
+ | 429 | rate limited | sleep 5s, retry |
130
+ | 500-503 | S2 down | sleep 30s, retry up to 3 times, then give up |
131
+
132
+ ## Polite use
133
+
134
+ The S2 API is a public service. Do not hammer it. If you have many candidates:
135
+
136
+ - Throttle to 1 QPS.
137
+ - Cache hits (the dedup script already serves as a deduplication cache).
138
+ - Do not parallelize. Verification is sequential by design.
.scider/skills/literature-review-agent/references/verification-rules.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Verification Rules
2
+
3
+ Source: arXiv:2604.05018, App. D.3 ("Citation Verification"), verbatim
4
+ specifications below.
5
+
6
+ ## Rule 1 — Fuzzy title match (Levenshtein > 70)
7
+
8
+ > Each candidate must resolve to a valid Semantic Scholar entity via a fuzzy
9
+ > title match (Levenshtein distance ratio > 70 [Levenshtein, 1965]),
10
+ > augmented by a point bonus for exact year alignment.
11
+
12
+ Implementation: `scripts/levenshtein_match.py` uses
13
+ `Levenshtein.ratio(a, b) * 100` from the `python-Levenshtein` package and
14
+ returns the integer ratio. Threshold: **strictly greater than 70**.
15
+
16
+ Examples:
17
+
18
+ | Candidate title | S2 title | Ratio | Verdict |
19
+ |---|---|---|---|
20
+ | "Attention Is All You Need" | "Attention Is All You Need" | 100 | accept |
21
+ | "Attention Is All You Need" | "Attention is All You Need." | 96 | accept |
22
+ | "Sparse Attention for Transformers" | "Sparse Attention in Transformers" | 88 | accept |
23
+ | "Self-Attention" | "Attention Is All You Need" | 47 | reject |
24
+ | "Linformer" | "Linformer: Self-Attention with Linear Complexity" | 28 | reject |
25
+
26
+ The Linformer case is the canonical false-negative: a short query against
27
+ a long title. Workaround: when the candidate title looks abbreviated
28
+ (< 4 words) and the S2 hit's title contains the candidate as a substring,
29
+ override the ratio check. The paper does not specify this workaround
30
+ explicitly; we add it as a soft safety net to avoid losing legitimate
31
+ short-title hits. See `levenshtein_match.py --substring-bypass`.
32
+
33
+ ## Rule 2 — Abstract must exist
34
+
35
+ > To enter the final context pool, the entity must possess a retrievable
36
+ > abstract...
37
+
38
+ Discard any verified hit where `abstract` is null, empty, or `"N/A"`. The
39
+ Section Writing Agent uses the abstract to ground its citations contextually
40
+ (per the Section Writing Agent prompt: "Read the abstract provided in
41
+ citation_map.json for the papers you are citing. Use this context to write
42
+ accurate, specific sentences about those works.").
43
+
44
+ ## Rule 3 — Strict temporal cutoff
45
+
46
+ > ...and strictly predate the research cutoff (when specified down to the
47
+ > month, the system defaults to the first day of that month).
48
+
49
+ Implementation: `scripts/check_cutoff.py`. Comparison rules:
50
+
51
+ - Cutoff is given as `YYYY-MM-DD`. The paper aligns it to venue submission
52
+ deadline (Nov 2024 for CVPR 2025, Oct 2024 for ICLR 2025 — App. D.1).
53
+ - Paper year is required. Paper month is optional.
54
+ - If paper has only year: assume month=12, day=31 (worst case for the paper —
55
+ must still be < cutoff).
56
+ - If paper has year + month: assume day=1 of that month.
57
+ - "Strictly predate" means `paper_date < cutoff_date`. Equality fails.
58
+
59
+ Examples (cutoff = 2024-10-01):
60
+
61
+ | Paper year | Paper month | Verdict |
62
+ |---|---|---|
63
+ | 2017 | — | accept |
64
+ | 2024 | 9 | accept (2024-09-01 < 2024-10-01) |
65
+ | 2024 | 10 | reject (2024-10-01 not strictly < 2024-10-01) |
66
+ | 2024 | — (only year) | reject (2024-12-31 ≥ 2024-10-01) |
67
+
68
+ The strict comparison is intentional: it prevents leakage of papers from
69
+ the same submission cycle as the target venue.
70
+
71
+ ## Rule 4 — Dedup by Semantic Scholar paperId
72
+
73
+ > Finally, gathered citations are deduplicated using unique paper ID keys.
74
+
75
+ Implementation: `scripts/dedupe_by_id.py`. Key precedence:
76
+
77
+ 1. `paperId` (S2's internal unique ID, always present on a verified hit)
78
+ 2. `externalIds.DOI` (lowercased)
79
+ 3. `externalIds.ArXiv` (without version suffix)
80
+ 4. Normalized title (lowercased, alphanumeric only) — fallback only
81
+
82
+ When two candidates collide, keep the one with the higher `match_score`.
83
+
84
+ ## Rule 5 — ≥90% citation integration
85
+
86
+ > The system constrains the model to cite only the provided verified papers,
87
+ > explicitly mandating that at least 90% of the gathered literature pool must
88
+ > be actively integrated and cited when synthesizing the Introduction and
89
+ > Related Work sections.
90
+
91
+ Implementation: `scripts/citation_coverage.py`. After the Lit Review writing
92
+ call produces `intro_relwork.tex`, this script:
93
+
94
+ 1. Extracts every `\cite{KEY}` and `\citep{KEY}` (and variants) from the
95
+ `.tex` file.
96
+ 2. Counts unique cited keys against `len(citation_pool.papers)`.
97
+ 3. Requires `cited / total ≥ 0.90`. Exits non-zero if not.
98
+
99
+ If the gate fails, the host agent must re-prompt the writing step,
100
+ explicitly listing the un-cited keys and asking the agent to integrate them.
.scider/skills/literature-review-agent/scripts/bibtex_format.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ bibtex_format.py — Generate refs.bib from a verified citation pool.
4
+
5
+ Reads citation_pool.json (output of dedupe_by_id.py) and emits a BibTeX file
6
+ with deterministic citation keys derived from the first author + year +
7
+ first significant title word.
8
+
9
+ Never invents fields. Only writes fields that are actually present in the
10
+ S2 metadata. Writes one of:
11
+ @article{ ... } — when venue looks like a journal
12
+ @inproceedings{ ... }— when venue looks like a conference
13
+ @misc{ ... } — fallback (e.g., arXiv-only papers)
14
+
15
+ Usage:
16
+ python bibtex_format.py --pool citation_pool.json --out refs.bib
17
+ """
18
+ import argparse
19
+ import json
20
+ import re
21
+ import sys
22
+
23
# Lowercase substrings whose presence in a venue string marks it as a
# conference (entry type @inproceedings) rather than a journal.
CONFERENCE_HINTS = {
    "neurips", "nips", "icml", "iclr",
    "cvpr", "iccv", "eccv", "wacv", "bmvc",
    "aaai", "ijcai", "uai",
    "acl", "emnlp", "naacl", "coling", "conll",
    "kdd", "www", "sigir",
    "siggraph", "interspeech", "icassp", "miccai",
}

# Lowercase words that are skipped when picking the title word used in a
# citation key.
STOPWORDS = {
    "a", "an", "and", "the", "of", "for", "to", "with", "on", "in",
    "by", "from", "as", "is", "are", "be", "via", "into", "their", "our",
    "we", "this", "that", "using", "use", "about", "at", "or", "if",
}
80
+
81
+
82
def normalize(s: str) -> str:
    """Return *s* lowercased with every character outside ``a``-``z`` removed."""
    lowered = s.lower()
    return "".join(re.findall(r"[a-z]", lowered))
84
+
85
+
86
def first_significant_word(title: str) -> str:
    """Return the first title word usable in a citation key.

    Words are runs of ASCII letters (hyphens allowed after the first letter).
    The first one that is longer than two characters and not a stopword is
    returned in normalized form; ``"paper"`` is returned when none qualifies.
    """
    lowered_words = (tok.lower() for tok in re.findall(r"[A-Za-z][A-Za-z\-]*", title))
    chosen = next(
        (tok for tok in lowered_words if len(tok) > 2 and tok not in STOPWORDS),
        None,
    )
    if chosen is None:
        return "paper"
    return normalize(chosen)
92
+
93
+
94
def first_author_lastname(authors: list[dict]) -> str:
    """Return the normalized last name of the first author, or ``"anon"``.

    The last whitespace-separated token of the first author's ``name`` is
    taken as the last name (commas are dropped first).

    Args:
        authors: Author dicts as found in the citation pool; only the first
            entry's ``name`` is read.

    Returns:
        The lowercase a-z form of the last name, or ``"anon"`` when the list
        is empty, the name is missing/``None``/blank/punctuation-only, or the
        last name contains no ASCII letters.
    """
    if not authors:
        return "anon"
    # ``name`` may be absent or explicitly null in the metadata.
    name = (authors[0].get("name") or "").strip()
    parts = name.replace(",", "").split()
    if not parts:
        # Covers blank names and names that were nothing but commas.
        return "anon"
    return normalize(parts[-1]) or "anon"
102
+
103
+
104
def make_key(paper: dict) -> str:
    """Build the base citation key ``<lastname><year><titleword>`` for *paper*.

    Missing or null fields fall back to placeholders: ``"anon"`` for the
    author, ``"0000"`` for the year and ``"paper"`` for the title word.

    Args:
        paper: One entry of the citation pool.

    Returns:
        The undisambiguated key; the caller resolves collisions.
    """
    last = first_author_lastname(paper.get("authors") or [])
    year = paper.get("year") or "0000"
    # ``title`` can be present but null; treat that like a missing title.
    word = first_significant_word(paper.get("title") or "")
    return f"{last}{year}{word}"
109
+
110
+
111
def is_conference(venue: str) -> bool:
    """Return True when *venue* contains any conference hint as a substring.

    Matching is case-insensitive; an empty or ``None`` venue is never a
    conference.
    """
    if venue:
        lowered = venue.lower()
        for hint in CONFERENCE_HINTS:
            if hint in lowered:
                return True
    return False
116
+
117
+
118
# Characters that must be backslash-escaped so a field value survives both
# BibTeX and the LaTeX run that typesets the bibliography:
#   { }  would unbalance the field's brace delimiters
#   &    alignment character
#   %    starts a LaTeX comment and swallows the rest of the line
#   #    macro-parameter character
#   _    subscript character (an error outside math mode)
_BIBTEX_ESCAPES = str.maketrans(
    {
        "{": "\\{",
        "}": "\\}",
        "&": "\\&",
        "%": "\\%",
        "#": "\\#",
        "_": "\\_",
    }
)


def escape_bibtex(s: str) -> str:
    """Escape LaTeX/BibTeX special characters in a field value.

    Args:
        s: Raw field text (title, author name, venue). May be empty or None.

    Returns:
        The text with ``{ } & % # _`` backslash-escaped, or ``""`` for a
        falsy input. Text that is already escaped is not detected and will
        be escaped again.
    """
    if not s:
        return ""
    # Single pass, so an inserted backslash is never re-processed.
    return s.translate(_BIBTEX_ESCAPES)
122
+
123
+
124
def author_field(authors: list[dict]) -> str:
    """Join author names into a BibTeX ``author`` value (``A and B and C``).

    Entries without a truthy ``name`` are skipped; each kept name is stripped
    and escaped.
    """
    escaped = []
    for entry in authors:
        if not entry.get("name"):
            continue
        escaped.append(escape_bibtex(entry.get("name", "").strip()))
    return " and ".join(escaped)
127
+
128
+
129
def format_entry(paper: dict, key: str) -> str:
    """Render one pool paper as a BibTeX entry under citation key *key*.

    The entry type follows the venue: ``@inproceedings`` (venue stored as
    ``booktitle``) when the venue looks like a conference, ``@article``
    (``journal``) for any other non-empty venue, and ``@misc`` with no venue
    field otherwise. Only fields present in the metadata are written.
    """
    venue = paper.get("venue") or ""
    if not venue:
        kind, venue_key = "misc", None
    elif is_conference(venue):
        kind, venue_key = "inproceedings", "booktitle"
    else:
        kind, venue_key = "article", "journal"

    out = [f"@{kind}{{{key},"]

    title = paper.get("title")
    if title:
        out.append(f" title = {{{escape_bibtex(title)}}},")

    authors = paper.get("authors")
    if authors:
        out.append(f" author = {{{author_field(authors)}}},")

    year = paper.get("year")
    if year:
        out.append(f" year = {{{year}}},")

    if venue and venue_key:
        out.append(f" {venue_key:8s} = {{{escape_bibtex(venue)}}},")

    external = paper.get("externalIds") or {}
    doi = external.get("DOI")
    if doi:
        out.append(f" doi = {{{doi}}},")
    arxiv = external.get("ArXiv")
    if arxiv:
        out.append(f" eprint = {{{arxiv}}},")
        out.append(" archivePrefix = {arXiv},")

    # The final field must not carry a trailing comma; rstrip is a no-op
    # when there is none.
    out[-1] = out[-1].rstrip(",")
    out.append("}")
    return "\n".join(out)
161
+
162
+
163
def main() -> int:
    """CLI entry point: write refs.bib and record each paper's key in the pool.

    Reads ``--pool``, generates one BibTeX entry per paper, writes them to
    ``--out``, then rewrites the pool file with a ``bibtex_key`` on every
    paper so later steps can map citations back to pool entries.

    Returns:
        0 on success, 1 when the pool contains no papers.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--out", required=True, help="output refs.bib")
    args = p.parse_args()

    # All files are read and written as UTF-8 explicitly: the pool is dumped
    # with ensure_ascii=False, so relying on the platform default encoding
    # can fail on non-ASCII titles or author names.
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)
    papers = pool.get("papers", [])
    if not papers:
        print("ERROR: pool contains no papers", file=sys.stderr)
        return 1

    keys_used: dict[str, int] = {}
    entries: list[str] = []
    paper_keys: list[str] = []

    for paper in papers:
        base_key = make_key(paper)
        # Disambiguate collisions with a letter suffix. The first paper keeps
        # the bare key; the second gets "b", the third "c", and so on.
        if base_key in keys_used:
            keys_used[base_key] += 1
            suffix = chr(ord("a") + keys_used[base_key] - 1)
            key = base_key + suffix
        else:
            keys_used[base_key] = 1
            key = base_key
        paper["bibtex_key"] = key
        paper_keys.append(key)
        entries.append(format_entry(paper, key))

    with open(args.out, "w", encoding="utf-8") as f:
        f.write("% Generated by paper-orchestra literature-review-agent/bibtex_format.py\n")
        f.write(f"% {len(entries)} entries from citation_pool.json\n\n")
        f.write("\n\n".join(entries))
        f.write("\n")

    # Write the keys back into the pool so the writing step has the
    # citation_checklist mapping. (Idempotent — overwrites with same data.)
    with open(args.pool, "w", encoding="utf-8") as f:
        json.dump(pool, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(entries)} BibTeX entries → {args.out}")
    print(f" keys: {', '.join(paper_keys[:5])}{'...' if len(paper_keys) > 5 else ''}")
    return 0
208
+
209
+
210
+ if __name__ == "__main__":
211
+ sys.exit(main())
.scider/skills/literature-review-agent/scripts/check_cutoff.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ check_cutoff.py — Strict temporal cutoff check for citation verification.
4
+
5
+ Implements the paper's Rule 3 (App. D.3): a paper passes only if its
6
+ publication date strictly predates the research cutoff. When only the year
7
+ is known, assume the worst case (Dec 31). When year + month are known,
8
+ assume day 1 of that month (per the paper's "first day of that month"
9
+ default).
10
+
11
+ Exit codes:
12
+ 0 paper strictly predates cutoff (PASS)
13
+ 1 paper does not strictly predate cutoff (FAIL)
14
+ 2 argument error
15
+
16
+ Usage:
17
+ python check_cutoff.py --paper-year 2024 --paper-month 9 --cutoff 2024-10-01
18
+ python check_cutoff.py --paper-year 2024 --cutoff 2024-10-01
19
+ python check_cutoff.py --paper-date 2024-09-15 --cutoff 2024-10-01
20
+ """
21
+ import argparse
22
+ import datetime as dt
23
+ import sys
24
+
25
+
26
def main() -> int:
    """CLI entry point: check that a paper strictly predates the cutoff.

    The paper date is taken from ``--paper-date`` when given. Otherwise it is
    built from ``--paper-year`` and the optional ``--paper-month``: the first
    day of the month when the month is known, 31 December when it is not.

    Returns:
        0 if the paper date is strictly before the cutoff, 1 if it is not,
        2 if the arguments are missing or do not form valid dates.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--paper-year", type=int, help="Paper publication year")
    p.add_argument("--paper-month", type=int, help="Paper publication month (1-12), optional")
    p.add_argument("--paper-date", help="Full paper date YYYY-MM-DD, overrides year/month")
    p.add_argument("--cutoff", required=True, help="Research cutoff date YYYY-MM-DD")
    args = p.parse_args()

    try:
        cutoff = dt.date.fromisoformat(args.cutoff)
    except ValueError:
        print(f"ERROR: --cutoff must be YYYY-MM-DD, got {args.cutoff}", file=sys.stderr)
        return 2

    if args.paper_date:
        try:
            paper_date = dt.date.fromisoformat(args.paper_date)
        except ValueError:
            print(f"ERROR: --paper-date must be YYYY-MM-DD, got {args.paper_date}", file=sys.stderr)
            return 2
    elif args.paper_year is not None:
        # An out-of-range year or month must be reported as an argument
        # error (2). Left uncaught, the ValueError would end the process
        # with status 1, which callers read as "FAIL: paper is too recent".
        try:
            if args.paper_month is not None:
                paper_date = dt.date(args.paper_year, args.paper_month, 1)
            else:
                # Year only: assume the worst case for the paper.
                paper_date = dt.date(args.paper_year, 12, 31)
        except ValueError:
            print(
                f"ERROR: invalid --paper-year/--paper-month: "
                f"{args.paper_year}/{args.paper_month}",
                file=sys.stderr,
            )
            return 2
    else:
        print("ERROR: must provide --paper-date OR --paper-year", file=sys.stderr)
        return 2

    if paper_date < cutoff:
        print(f"PASS paper={paper_date} < cutoff={cutoff}")
        return 0
    print(f"FAIL paper={paper_date} not strictly before cutoff={cutoff}")
    return 1
60
+
61
+
62
+ if __name__ == "__main__":
63
+ sys.exit(main())
.scider/skills/literature-review-agent/scripts/citation_coverage.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ citation_coverage.py — Enforce the paper's ≥90% citation integration rule
4
+ (App. D.3).
5
+
6
+ Greps a generated .tex file for all citation commands, counts the unique
7
+ keys actually cited, and compares against the verified citation pool.
8
+ Exits non-zero if coverage < 90%.
9
+
10
+ Usage:
11
+ python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json
12
+ python citation_coverage.py --tex intro_relwork.tex --pool citation_pool.json --threshold 0.85
13
+ """
14
+ import argparse
15
+ import json
16
+ import re
17
+ import sys
18
+
19
# Matches one citation command and captures its brace argument.
#   - command: \cite plus the natbib suffixes, and the biblatex
#     \autocite / \parencite / \textcite / \footcite forms, each also
#     accepted with a capitalised first letter where that variant exists
#   - optional star: \citet*{...}
#   - any number of optional arguments: \citep[see][p.~5]{...}
CITE_RE = re.compile(
    r"\\(?:[Cc]ite(?:p|t|alp|alt|author|year|yearpar)?"
    r"|[Aa]utocite|[Pp]arencite|[Tt]extcite|footcite)"
    r"\*?"
    r"(?:\[[^\]]*\])*"
    r"\{([^}]+)\}"
)


def extract_cited_keys(tex: str) -> set[str]:
    """Return the set of unique citation keys used in *tex*.

    Every citation command matched by ``CITE_RE`` contributes its
    comma-separated keys; surrounding whitespace is stripped and empty
    items (e.g. from ``{a,,b}``) are ignored.

    Args:
        tex: LaTeX source text.

    Returns:
        The distinct keys, in no particular order.
    """
    return {
        key.strip()
        for match in CITE_RE.finditer(tex)
        for key in match.group(1).split(",")
        if key.strip()
    }
34
+
35
+
36
def main() -> int:
    """CLI entry point: check citation coverage of a .tex file against the pool.

    Counts how many of the pool's ``bibtex_key`` values are cited in ``--tex``
    and compares that count with ``int(threshold * pool_size)``. Keys cited
    but absent from the pool are reported as a warning only.

    Returns:
        0 when enough pool papers are cited; 1 when the pool has no
        ``bibtex_key`` values or coverage is below the threshold count.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--tex", required=True, help="LaTeX file to inspect")
    parser.add_argument("--pool", required=True, help="citation_pool.json")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.90,
        help="Minimum integration ratio (default 0.90 per paper)",
    )
    args = parser.parse_args()

    # Read both inputs as UTF-8 explicitly; the pool is written with
    # ensure_ascii=False and the .tex may contain non-ASCII text, so the
    # platform default encoding is not reliable.
    with open(args.tex, encoding="utf-8") as f:
        tex = f.read()
    with open(args.pool, encoding="utf-8") as f:
        pool = json.load(f)

    pool_papers = pool.get("papers", [])
    # Index papers by key once; a later duplicate key overrides an earlier one.
    paper_by_key = {
        paper.get("bibtex_key"): paper for paper in pool_papers if paper.get("bibtex_key")
    }
    pool_keys = set(paper_by_key)
    if not pool_keys:
        print("ERROR: pool has no bibtex_keys. Run bibtex_format.py first.", file=sys.stderr)
        return 1

    cited = extract_cited_keys(tex)
    n_pool = len(pool_keys)
    n_cited = len(cited & pool_keys)
    ratio = n_cited / n_pool if n_pool else 0.0
    threshold_n = int(args.threshold * n_pool)

    print(
        f"Coverage: {n_cited}/{n_pool} = {ratio*100:.1f}% "
        f"(threshold {args.threshold*100:.0f}% = {threshold_n})"
    )

    # report keys cited but NOT in pool — those are forbidden by the prompt
    foreign = cited - pool_keys
    if foreign:
        print(
            f"\nWARNING: {len(foreign)} cited keys NOT in citation pool "
            f"(violates 'cite ONLY collected_papers' rule):"
        )
        for k in sorted(foreign):
            print(f" - {k}")

    if n_cited < threshold_n:
        uncited = pool_keys - cited
        print(f"\nFAIL: missing {len(uncited)} pool papers from .tex:")
        # show with title for actionable re-prompting
        for k in sorted(uncited):
            paper = paper_by_key[k]
            # Either field may be absent or null in the pool.
            tag = ",".join(paper.get("discovered_for") or []) or "?"
            t = paper.get("title") or ""
            print(f" - {k:40s} [{tag}] {t[:60]}")
        return 1

    print("OK: citation coverage meets threshold")
    return 0
101
+
102
+
103
+ if __name__ == "__main__":
104
+ sys.exit(main())
.scider/skills/literature-review-agent/scripts/dedupe_by_id.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ dedupe_by_id.py — Deduplicate a verified citation pool using Semantic Scholar
4
+ unique paperId, with DOI / ArXiv / normalized-title fallbacks.
5
+
6
+ Implements the paper's Rule 4 (App. D.3): "gathered citations are
7
+ deduplicated using unique paper ID keys".
8
+
9
+ Also computes `min_cite_paper_count = floor(0.9 * len(papers))` for the
10
+ ≥90% citation integration rule.
11
+
12
+ Usage:
13
+ python dedupe_by_id.py --in raw_pool.json --out citation_pool.json [--cutoff 2024-10-01]
14
+ """
15
+ import argparse
16
+ import json
17
+ import math
18
+ import re
19
+ import sys
20
+
21
+
22
def norm_title(t: str) -> str:
    """Return *t* lowercased with everything except ``a``-``z`` and digits removed."""
    kept = re.findall(r"[a-z0-9]", t.lower())
    return "".join(kept)
24
+
25
+
26
def make_key(paper: dict) -> str:
    """Return the deduplication key for *paper*.

    Precedence: Semantic Scholar ``paperId``, then DOI (lowercased), then
    arXiv ID (version suffix removed, lowercased), then the normalized title.
    Each key carries a prefix (``s2:``, ``doi:``, ``arxiv:``, ``title:``) so
    identifiers of different kinds cannot collide.

    Args:
        paper: A candidate dict from the raw pool.

    Returns:
        The prefixed key string.
    """
    if paper.get("paperId"):
        return f"s2:{paper['paperId']}"
    ext = paper.get("externalIds") or {}
    if ext.get("DOI"):
        return f"doi:{ext['DOI'].lower()}"
    if ext.get("ArXiv"):
        # Strip only a trailing "v<digits>" version marker. Splitting on the
        # first "v" would truncate old-style IDs whose archive name contains
        # the letter, e.g. "solv-int/9901001v1".
        arxiv_id = re.sub(r"v\d+$", "", ext["ArXiv"])
        return f"arxiv:{arxiv_id.lower()}"
    # Last resort; the title may be absent or null.
    return f"title:{norm_title(paper.get('title') or '')}"
38
+
39
+
40
def main() -> int:
    """CLI entry point: deduplicate a raw pool and write citation_pool.json.

    Candidates are read from the input's ``papers`` (or, failing that,
    ``candidates``) list and grouped by ``make_key``. When two candidates
    share a key, the one with the strictly higher ``match_score`` is kept
    (the earlier one wins ties) and their ``discovered_for`` tags are merged
    in first-seen order.

    Returns:
        0 on success, 1 when the input contains no candidates.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--in", dest="inp", required=True, help="Raw verified pool JSON")
    p.add_argument("--out", required=True, help="Deduped citation_pool.json")
    p.add_argument("--cutoff", help="Cutoff date YYYY-MM-DD (recorded in output)")
    args = p.parse_args()

    # UTF-8 explicitly: the output is dumped with ensure_ascii=False, so the
    # platform default encoding can fail on non-ASCII titles or names.
    with open(args.inp, encoding="utf-8") as f:
        raw = json.load(f)

    candidates = raw.get("papers") or raw.get("candidates") or []
    if not candidates:
        print("ERROR: input has neither 'papers' nor 'candidates' key", file=sys.stderr)
        return 1

    by_key: dict[str, dict] = {}
    collisions: list[tuple[str, str]] = []
    for c in candidates:
        key = make_key(c)
        existing = by_key.get(key)
        if existing is None:
            by_key[key] = c
            continue

        # ``match_score`` / ``discovered_for`` may be absent or null.
        score_new = c.get("match_score") or 0
        score_old = existing.get("match_score") or 0
        merged = (existing.get("discovered_for") or []) + (c.get("discovered_for") or [])
        winner = c if score_new > score_old else existing
        winner["discovered_for"] = list(dict.fromkeys(merged))  # preserve order, dedupe
        by_key[key] = winner
        collisions.append((key, c.get("title", "")))

    deduped = list(by_key.values())
    n = len(deduped)
    min_cite = math.floor(0.9 * n)

    out = {
        "papers": deduped,
        "min_cite_paper_count": min_cite,
        "n_total": n,
        "n_collisions_merged": len(collisions),
    }
    if args.cutoff:
        out["cutoff_date"] = args.cutoff

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    print(f"OK: {len(candidates)} candidates → {n} unique papers")
    print(f" {len(collisions)} duplicates merged")
    print(f" min_cite_paper_count (≥90%): {min_cite}")
    return 0
95
+
96
+
97
+ if __name__ == "__main__":
98
+ sys.exit(main())
.scider/skills/literature-review-agent/scripts/exa_search.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ exa_search.py — Optional Exa (https://exa.ai) backend for the literature
4
+ review agent's Phase 1 (parallel candidate discovery) step.
5
+
6
+ Exa is a search engine optimized for finding academic papers and other
7
+ high-quality content. It is OPTIONAL — the literature-review-agent works
8
+ fine with any host coding agent's native web search tool. Use Exa only if:
9
+
10
+ - Your host has no built-in web search (e.g., Aider, OpenCode, generic
11
+ CLI agents).
12
+ - You want a research-paper-focused search backend with better
13
+ signal-to-noise than general web search.
14
+ - You're running the pipeline in batch / non-interactive mode and want
15
+ a deterministic, scriptable backend.
16
+
17
+ This helper reads EXA_API_KEY from the environment. The key is YOUR
18
+ responsibility to provide; this repo never commits one. Get a key at
19
+ https://dashboard.exa.ai/.
20
+
21
+ Usage:
22
+ export EXA_API_KEY="your-key-here"
23
+ python exa_search.py --query "Sparse attention long context" --num-results 15
24
+ python exa_search.py --query "..." --raw # full JSON
25
+ python exa_search.py --query "..." --discovered-for "related_work[2.1]"
26
+
27
+ Default output: JSON candidates in the literature-review-agent format, ready
28
+ to be merged into raw_candidates.json before Phase 2 verification.
29
+
30
+ Exit codes:
31
+ 0 query succeeded
32
+ 1 EXA_API_KEY missing, HTTP error, network error, or empty results
33
+ """
34
+ import argparse
35
+ import json
36
+ import os
37
+ import sys
38
+ import urllib.error
39
+ import urllib.request
40
+
41
# URL that search() POSTs the JSON query to.
EXA_ENDPOINT = "https://api.exa.ai/search"
# Default for --num-results.
DEFAULT_NUM = 10
# Upper clamp for --num-results (the lower clamp is 1).
MAX_NUM = 20  # the user explicitly asked for a 10-20 range
# Maximum length, in characters, of the joined-highlights snippet per candidate.
SNIPPET_CAP = 1500
45
+
46
+
47
+ def search(query: str, num_results: int, category: str | None, highlight_max_chars: int) -> dict:
48
+ api_key = os.environ.get("EXA_API_KEY")
49
+ if not api_key:
50
+ print(
51
+ "ERROR: EXA_API_KEY environment variable not set.\n"
52
+ "Get a key at https://dashboard.exa.ai/ and run:\n"
53
+ ' export EXA_API_KEY="your-key-here"\n'
54
+ "Then retry. The literature-review-agent also works without\n"
55
+ "Exa — see references/discovery-pipeline.md for the default\n"
56
+ "host-native web search path.",
57
+ file=sys.stderr,
58
+ )
59
+ sys.exit(1)
60
+
61
+ body: dict = {
62
+ "query": query,
63
+ "numResults": num_results,
64
+ "type": "auto",
65
+ "contents": {"highlights": {"maxCharacters": highlight_max_chars}},
66
+ }
67
+ if category:
68
+ body["category"] = category
69
+
70
+ req = urllib.request.Request(
71
+ EXA_ENDPOINT,
72
+ data=json.dumps(body).encode("utf-8"),
73
+ headers={
74
+ "content-type": "application/json",
75
+ "x-api-key": api_key,
76
+ },
77
+ method="POST",
78
+ )
79
+ try:
80
+ with urllib.request.urlopen(req, timeout=30) as resp:
81
+ return json.loads(resp.read().decode("utf-8"))
82
+ except urllib.error.HTTPError as e:
83
+ body_text = e.read().decode("utf-8", errors="replace")[:500]
84
+ print(f"ERROR: Exa HTTP {e.code}: {body_text}", file=sys.stderr)
85
+ sys.exit(1)
86
+ except urllib.error.URLError as e:
87
+ print(f"ERROR: Exa network error: {e.reason}", file=sys.stderr)
88
+ sys.exit(1)
89
+
90
+
91
def normalize(exa_response: dict, discovered_for: list[str]) -> list[dict]:
    """Convert Exa results into the literature-review-agent candidate format.

    Each result becomes a dict with its stripped title, a snippet built from
    the space-joined highlights (cut to ``SNIPPET_CAP`` characters), the
    source URL (falling back to the result id), a fresh copy of
    *discovered_for*, and the raw Exa id and published date.
    """

    def to_candidate(result: dict) -> dict:
        pieces = result.get("highlights") or []
        joined = " ".join(piece.strip() for piece in pieces)
        return {
            "title": (result.get("title") or "").strip(),
            "snippet": joined[:SNIPPET_CAP],
            "source_url": result.get("url") or result.get("id") or "",
            # Copy so candidates never share one tag list.
            "discovered_for": list(discovered_for),
            "_exa_id": result.get("id"),
            "_exa_published_date": result.get("publishedDate"),
        }

    return [to_candidate(result) for result in exa_response.get("results", [])]
110
+
111
+
112
def main() -> int:
    """CLI entry point: run one Exa query and print the result as JSON.

    Prints either the unmodified Exa response (``--raw``) or a
    ``{"candidates": [...]}`` object to stdout, followed by a newline.

    Returns:
        0 when the query returned results, 1 when it returned none.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--query", required=True, help="Search query")
    parser.add_argument(
        "--num-results",
        type=int,
        default=DEFAULT_NUM,
        help=f"Number of results to fetch (default {DEFAULT_NUM}, clamped to [1, {MAX_NUM}])",
    )
    parser.add_argument(
        "--category",
        default="research paper",
        help='Exa category filter (default "research paper"; pass an empty string to disable)',
    )
    parser.add_argument(
        "--highlight-chars",
        type=int,
        default=4000,
        help="Max characters per highlight (default 4000)",
    )
    parser.add_argument(
        "--discovered-for",
        default="intro",
        help=(
            'Tag to attach to each candidate (default "intro"). '
            'Use "related_work[2.1]" or similar for cluster-specific queries '
            "so the downstream citation_coverage gate can attribute the "
            "citation to the right section."
        ),
    )
    parser.add_argument(
        "--raw",
        action="store_true",
        help="Print the full Exa response JSON unmodified instead of normalized candidates",
    )
    args = parser.parse_args()

    # Keep the requested count inside [1, MAX_NUM].
    wanted = min(max(args.num_results, 1), MAX_NUM)
    # An empty --category disables the filter.
    category = args.category if args.category else None

    response = search(args.query, wanted, category, args.highlight_chars)
    if not response.get("results"):
        print(f"WARN: Exa returned 0 results for query: {args.query!r}", file=sys.stderr)
        return 1

    if args.raw:
        payload = response
    else:
        payload = {"candidates": normalize(response, [args.discovered_for])}
    json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0
166
+
167
+
168
+ if __name__ == "__main__":
169
+ sys.exit(main())
.scider/skills/literature-review-agent/scripts/levenshtein_match.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ levenshtein_match.py — Fuzzy title match for citation verification.
4
+
5
+ Implements the paper's Rule 1 (App. D.3): a candidate paper passes only if
6
+ its title's Levenshtein ratio against the Semantic Scholar hit's title is
7
+ strictly greater than 70.
8
+
9
+ Includes a substring-bypass safety net for short candidate titles (the
10
+ Linformer false-negative case): if the candidate is < 4 words and is
11
+ contained as a substring in the S2 hit's title, the ratio is raised to at least 95.
12
+
13
+ Exit code is always 0; the integer ratio is printed to stdout. The caller
14
+ parses it and decides whether to discard.
15
+
16
+ Usage:
17
+ python levenshtein_match.py --candidate "..." --found "..."
18
+ python levenshtein_match.py --candidate "..." --found "..." --substring-bypass
19
+ """
20
+ import argparse
21
+ import re
22
+ import sys
23
+
24
+ try:
25
+ import Levenshtein
26
+ except ImportError:
27
+ print(
28
+ "ERROR: python-Levenshtein required. Install with: pip install python-Levenshtein",
29
+ file=sys.stderr,
30
+ )
31
+ sys.exit(2)
32
+
33
+
34
def normalize(s: str) -> str:
    """Return *s* lowercased, with punctuation removed and whitespace collapsed.

    Every character outside ``[a-z0-9]`` and whitespace is replaced by a
    space, runs of whitespace collapse to a single space, and the result
    carries no leading or trailing blank.
    """
    s = re.sub(r"[^a-z0-9\s]", " ", s.lower())
    # Strip *after* the substitution: punctuation at either end of the title
    # becomes a space, which would otherwise survive as a stray blank and
    # skew both the ratio and the substring test.
    return re.sub(r"\s+", " ", s).strip()
39
+
40
+
41
def ratio(a: str, b: str, substring_bypass: bool = False) -> int:
    """Return the similarity of two titles as a rounded integer percentage.

    Both titles are passed through ``normalize`` and compared with
    ``Levenshtein.ratio``; the result is scaled by 100 and rounded.

    Args:
        a: Candidate title.
        b: Title to compare against.
        substring_bypass: When true, a candidate of one to three words that
            occurs verbatim inside *b* scores at least 95.

    Returns:
        The integer score.
    """
    na, nb = normalize(a), normalize(b)
    r = int(round(Levenshtein.ratio(na, nb) * 100))
    # Require at least one word: an empty (or whitespace-only) candidate is a
    # substring of every title and must not be bumped to a passing score.
    n_words = len(na.split())
    if substring_bypass and 0 < n_words < 4 and na in nb:
        return max(r, 95)
    return r
48
+
49
+
50
def main() -> int:
    """Parse the CLI arguments, print ``"<ratio> <PASS|FAIL>"`` and return 0.

    The verdict is PASS only when the ratio is strictly greater than
    ``--threshold``. The return value is 0 for either verdict, so callers
    must read the verdict from stdout.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage examples in the module docstring laid out as
        # written instead of letting argparse re-wrap them into a paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--candidate", required=True, help="The original candidate title (from web search)"
    )
    p.add_argument("--found", required=True, help="The title returned by Semantic Scholar")
    p.add_argument(
        "--substring-bypass",
        action="store_true",
        help="Bump short-candidate substring matches to 95",
    )
    p.add_argument(
        "--threshold", type=int, default=70, help="Print PASS/FAIL alongside the ratio (default 70)"
    )
    args = p.parse_args()

    r = ratio(args.candidate, args.found, args.substring_bypass)
    verdict = "PASS" if r > args.threshold else "FAIL"
    print(f"{r} {verdict}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/literature-review-agent/scripts/pre_dedup_candidates.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ pre_dedup_candidates.py — Deduplicate Phase 1 raw candidates by normalized
4
+ title before Phase 2 Semantic Scholar verification.
5
+
6
+ Multiple search queries in Phase 1 often return the same papers. Verifying
7
+ duplicates wastes S2 quota (1 QPS hard cap) and adds 30-40% unnecessary
8
+ wall-time. This script removes obvious duplicates — same paper found via
9
+ multiple queries — before the sequential verification loop begins.
10
+
11
+ Dedup strategy (in order of preference):
12
+ 1. Exact arXiv ID match extracted from source URL or snippet.
13
+ 2. Levenshtein ratio >= 92 on normalized titles (high threshold to avoid
14
+ false collisions between similarly-named papers).
15
+
16
+ When two candidates are considered the same, we keep the one that appeared
17
+ earlier in the list and merge their `discovered_for` attribution tags so
18
+ the surviving entry is credited to all originating queries.
19
+
20
+ Usage:
21
+ python pre_dedup_candidates.py \\
22
+ --in workspace/raw_candidates.json \\
23
+ --out workspace/deduped_candidates.json
24
+
25
+ Input JSON shape:
26
+ {"candidates": [{"title": "...", "url": "...", "snippet": "...",
27
+ "discovered_for": ["intro.1"]}, ...]}
28
+ OR a bare list.
29
+ """
30
+ import argparse
31
+ import json
32
+ import re
33
+ import sys
34
+
35
+ ARXIV_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})", re.IGNORECASE)
36
+
37
+
38
def norm_title(t: str) -> str:
    """Return *t* lowercased, keeping only ``[a-z0-9]`` words joined by single spaces."""
    lowered = t.lower()
    # Anything that is not an ASCII letter, digit or space becomes a separator.
    cleaned = re.sub(r"[^a-z0-9 ]", " ", lowered)
    words = cleaned.split()
    return " ".join(words)
41
+
42
+
43
def levenshtein_ratio(a: str, b: str) -> float:
    """Return a 0-100 similarity score derived from the edit distance.

    The score is ``(1 - distance / len(longer string)) * 100``, where the
    distance counts single-character insertions, deletions and
    substitutions. Two empty strings score 100.0; exactly one empty string
    scores 0.0.
    """
    if not a:
        return 0.0 if b else 100.0
    if not b:
        return 0.0

    # Iterate over the longer string and keep one row per character of the
    # shorter one, so only two rows are alive at any time.
    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
    width = len(shorter)
    row = list(range(width + 1))
    for i, ch_long in enumerate(longer, start=1):
        nxt = [i]
        for j, ch_short in enumerate(shorter, start=1):
            substitution = row[j - 1] + (0 if ch_long == ch_short else 1)
            nxt.append(min(row[j] + 1, nxt[j - 1] + 1, substitution))
        row = nxt
    return (1.0 - row[width] / len(longer)) * 100.0
61
+
62
+
63
def extract_arxiv_id(candidate: dict) -> str | None:
    """Return the first arXiv ID found in the candidate's URL or snippet.

    ``url`` is searched before ``snippet``. Fields that are missing or not
    strings (for example JSON ``null``) are skipped rather than handed to
    the regex, which would raise ``TypeError``.

    Returns:
        The ID captured by ``ARXIV_RE``, or ``None`` when neither field
        contains a match.
    """
    for field in ("url", "snippet"):
        text = candidate.get(field)
        if not isinstance(text, str):
            continue
        m = ARXIV_RE.search(text)
        if m:
            return m.group(1)
    return None
69
+
70
+
71
def make_exact_key(candidate: dict) -> str:
    """Canonical key: arXiv ID if extractable, else normalized title.

    Returns ``"arxiv:<id>"`` when an arXiv ID is found, otherwise
    ``"title:<normalized title>"``. A missing or ``null`` title is treated as
    the empty string, so the result is then the bare ``"title:"``.
    """
    aid = extract_arxiv_id(candidate)
    if aid:
        return f"arxiv:{aid}"
    # `or ""` also covers an explicit JSON null, which .get()'s default
    # would let through to norm_title() as None.
    return f"title:{norm_title(candidate.get('title') or '')}"
77
+
78
+
79
def merge_discovered_for(a: dict, b: dict) -> list:
    """Return the ``discovered_for`` tags of *a* then *b*, without duplicates.

    Order of first appearance is preserved. A missing or empty value counts
    as no tags, and a bare string is treated as a single tag instead of
    being concatenated or split into characters. Neither input is mutated.
    """

    def _tags(candidate: dict) -> list:
        tags = candidate.get("discovered_for") or []
        if isinstance(tags, str):
            return [tags]
        return list(tags)

    # dict.fromkeys keeps insertion order, so this is an ordered de-dup.
    return list(dict.fromkeys(_tags(a) + _tags(b)))
83
+
84
+
85
def dedup(candidates: list[dict], title_ratio_threshold: float = 92.0) -> list[dict]:
    """Remove duplicate candidates, merging their ``discovered_for`` tags.

    Pass 1 groups candidates by ``make_exact_key``. Pass 2 compares the
    normalized titles of the survivors pairwise and drops the later one when
    ``levenshtein_ratio`` is at least *title_ratio_threshold*. The earliest
    occurrence is always kept, and the input dicts are not mutated
    (survivors are shallow copies).

    Candidates with no usable title and no arXiv ID are never treated as
    duplicates of each other: there is nothing to compare them on.

    Args:
        candidates: Candidate dicts in discovery order.
        title_ratio_threshold: Minimum score for a fuzzy title match.

    Returns:
        The surviving candidates in their original relative order.
    """
    # Pass 1: exact key dedup (arXiv ID or identical normalized title)
    by_key: dict[str, dict] = {}
    for idx, c in enumerate(candidates):
        key = make_exact_key(c)
        if key == "title:":
            # Empty title and no arXiv ID: give the entry a private key so
            # unrelated untitled candidates do not collapse into one.
            key = f"untitled:{idx}"
        if key in by_key:
            by_key[key]["discovered_for"] = merge_discovered_for(by_key[key], c)
        else:
            by_key[key] = dict(c)

    deduped = list(by_key.values())

    # Pass 2: fuzzy title dedup — O(n²) but n is ~50-100 candidates max
    normed = [norm_title(c.get("title") or "") for c in deduped]
    drop: set[int] = set()
    for i, title_i in enumerate(normed):
        # Two empty titles would score 100, so they are kept out of the
        # comparison entirely.
        if i in drop or not title_i:
            continue
        for j in range(i + 1, len(deduped)):
            if j in drop or not normed[j]:
                continue
            if levenshtein_ratio(title_i, normed[j]) >= title_ratio_threshold:
                deduped[i]["discovered_for"] = merge_discovered_for(deduped[i], deduped[j])
                drop.add(j)

    return [c for idx, c in enumerate(deduped) if idx not in drop]
111
+
112
+
113
def main() -> int:
    """Read raw candidates, de-duplicate them and write the result as JSON.

    The input may be a bare JSON array or an object holding the list under
    ``"candidates"`` (or ``"papers"``). The output object contains the
    surviving candidates plus before/after/removed counts.

    Returns:
        0 on success, 1 when the input does not contain a candidate list.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the layout of the usage example in the module docstring.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--in", dest="inp", required=True, help="Raw Phase 1 candidates JSON")
    p.add_argument("--out", required=True, help="Deduped candidates JSON")
    p.add_argument(
        "--title-ratio",
        type=float,
        default=92.0,
        help="Levenshtein ratio threshold for fuzzy title match (default: 92)",
    )
    args = p.parse_args()

    with open(args.inp) as f:
        raw = json.load(f)

    if isinstance(raw, list):
        candidates = raw
    elif isinstance(raw, dict):
        candidates = raw.get("candidates") or raw.get("papers") or []
    else:
        # A scalar JSON root (string, number, null) has no .get(); route it
        # to the explicit error below instead of an AttributeError traceback.
        candidates = None

    if not isinstance(candidates, list):
        print("ERROR: input must be a JSON array or object with 'candidates' key", file=sys.stderr)
        return 1

    before = len(candidates)
    result = dedup(candidates, title_ratio_threshold=args.title_ratio)
    after = len(result)
    removed = before - after

    out_obj = {
        "candidates": result,
        "n_before_dedup": before,
        "n_after_dedup": after,
        "n_removed": removed,
    }
    with open(args.out, "w") as f:
        json.dump(out_obj, f, indent=2, ensure_ascii=False)

    print(f"OK: {before} candidates → {after} unique ({removed} duplicates removed)")
    return 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/literature-review-agent/scripts/s2_cache.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ s2_cache.py — Persistent Semantic Scholar verification cache.
4
+
5
+ Problem: Phase 2 verification is throttled to 1 QPS. If a pipeline run
6
+ fails partway through (gate error, network timeout, interrupted session),
7
+ re-running wastes the full S2 wait time again on already-verified papers.
8
+
9
+ Solution: a flat JSON cache at workspace/cache/s2_cache.json. On a cache
10
+ HIT the script emits the stored response and exits 0 so the caller can skip
11
+ the live S2 request. On a cache MISS it exits 1. After a live request the
12
+ caller stores the result with --store.
13
+
14
+ The cache key is derived from the normalized query title (lowercase,
15
+ alphanumeric only) so minor whitespace differences still hit.
16
+
17
+ Usage:
18
+
19
+ CHECK mode — exits 0 + prints JSON if cached, else exits 1:
20
+ python s2_cache.py --cache workspace/cache/s2_cache.json \\
21
+ --check "Attention Is All You Need"
22
+
23
+ STORE mode — write a response into the cache:
24
+ python s2_cache.py --cache workspace/cache/s2_cache.json \\
25
+ --store "Attention Is All You Need" \\
26
+ --response '{"paperId": "...", "title": "..."}'
27
+
28
+ STATS mode — print the cache path, entry count, and a sample of stored keys:
29
+ python s2_cache.py --cache workspace/cache/s2_cache.json --stats
30
+ """
31
+ import argparse
32
+ import json
33
+ import os
34
+ import re
35
+ import sys
36
+
37
+
38
def norm_key(title: str) -> str:
    """Build the cache key: the lowercased title reduced to ``[a-z0-9]`` only."""
    kept = [
        ch
        for ch in title.lower()
        if "a" <= ch <= "z" or "0" <= ch <= "9"
    ]
    return "".join(kept)
41
+
42
+
43
def load_cache(path: str) -> dict:
    """Load the cache mapping from *path*.

    Returns an empty dict when the file does not exist, is not valid JSON,
    or holds something other than a JSON object. The last two cases print a
    warning to stderr so the loss of cached entries is visible; stdout is
    left untouched.
    """
    if not os.path.isfile(path):
        return {}
    try:
        with open(path) as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        print(
            f"WARN: cache file {path} is not valid JSON ({exc}); using an empty cache",
            file=sys.stderr,
        )
        return {}
    if not isinstance(data, dict):
        # A list or scalar root would break `key in cache` / `cache[key] = ...`
        # later on, so it is rejected here.
        print(
            f"WARN: cache file {path} does not contain a JSON object; using an empty cache",
            file=sys.stderr,
        )
        return {}
    return data
51
+
52
+
53
def save_cache(path: str, cache: dict) -> None:
    """Write *cache* to *path* as indented JSON, replacing the file atomically.

    Parent directories are created as needed. The data is first written to
    ``<path>.tmp`` and then moved over the target with ``os.replace``, so a
    failure or interruption while serializing leaves the previous cache
    file intact instead of truncated.
    """
    target = os.path.abspath(path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    tmp = f"{target}.tmp"
    try:
        with open(tmp, "w") as f:
            json.dump(cache, f, indent=2, ensure_ascii=False)
        os.replace(tmp, target)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partial file left behind by a failed write.
        if os.path.exists(tmp):
            os.remove(tmp)
57
+
58
+
59
def main() -> int:
    """Run one cache operation selected by ``--check``, ``--store`` or ``--stats``.

    Returns:
        * ``--stats``: always 0.
        * ``--check``: 0 and the stored JSON on stdout for a hit, 1 for a miss.
        * ``--store``: 0 on success, 2 when ``--response`` is missing or not
          valid JSON, or when the title yields an empty cache key.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the layout of the usage examples in the module docstring.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--cache", required=True, help="Path to cache JSON file")

    mode = p.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--check",
        metavar="TITLE",
        help="Check for title; exit 0 + print JSON if found, else exit 1",
    )
    mode.add_argument(
        "--store", metavar="TITLE", help="Store a response for TITLE (requires --response)"
    )
    mode.add_argument("--stats", action="store_true", help="Print cache statistics")

    p.add_argument(
        "--response", metavar="JSON", help="S2 response JSON to store (used with --store)"
    )
    args = p.parse_args()

    cache = load_cache(args.cache)

    if args.stats:
        print(f"Cache file : {args.cache}")
        print(f"Entries : {len(cache)}")
        if cache:
            print("Sample keys:", list(cache.keys())[:5])
        return 0

    # Compare against None: `--check ""` is falsy but is still check mode and
    # must not fall through to the store branch.
    if args.check is not None:
        key = norm_key(args.check)
        # An empty key (title with no ASCII letters/digits) identifies
        # nothing, so it is always a miss.
        if key and key in cache:
            print(json.dumps(cache[key]))
            return 0  # HIT
        return 1  # MISS

    # --store mode
    if not args.response:
        print("ERROR: --store requires --response", file=sys.stderr)
        return 2
    try:
        response = json.loads(args.response)
    except json.JSONDecodeError as e:
        print(f"ERROR: invalid JSON in --response: {e}", file=sys.stderr)
        return 2

    key = norm_key(args.store)
    if not key:
        # Storing under "" would make every such title hit the same entry.
        print(
            f"ERROR: cannot derive a cache key from title '{args.store}'",
            file=sys.stderr,
        )
        return 2
    cache[key] = response
    save_cache(args.cache, cache)
    print(f"OK: cached '{args.store}' → key '{key}' ({len(cache)} total entries)")
    return 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/literature-review-agent/scripts/s2_search.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ s2_search.py — Semantic Scholar title-search helper for Phase 2 verification.
4
+
5
+ Queries the Semantic Scholar Graph API for a paper by title and returns the
6
+ top candidate hits as JSON. Used by the literature-review-agent to verify
7
+ each candidate from Phase 1 before adding it to citation_pool.json.
8
+
9
+ API key (optional):
10
+ If SEMANTIC_SCHOLAR_API_KEY is set in the environment the key is forwarded
11
+ via the ``x-api-key`` header, which raises the rate limit from ~100 req/5 min
12
+ (unauthenticated) to 1 req/s sustained with higher burst headroom.
13
+ If the variable is absent the script falls back to the public unauthenticated
14
+ endpoint — the pipeline works fine without a key; just keep to ≤1 QPS.
15
+
16
+ Get a free key at: https://api.semanticscholar.org/
17
+ Then export it once before running the pipeline:
18
+ export SEMANTIC_SCHOLAR_API_KEY="your-key-here"
19
+
20
+ Usage:
21
+ # check for key and search
22
+ python s2_search.py --query "Attention is All You Need"
23
+
24
+ # request more hits and extra fields
25
+ python s2_search.py --query "BERT pre-training" --limit 10 \\
26
+ --fields title,abstract,year,authors,venue,externalIds,citationCount
27
+
28
+ # pretty-print raw S2 JSON
29
+ python s2_search.py --query "GPT-4 technical report" --raw
30
+
31
+ Exit codes:
32
+ 0 at least one result returned
33
+ 1 HTTP error, network error, or zero results
34
+ 2 usage error (bad arguments)
35
+ """
36
+ import argparse
37
+ import json
38
+ import os
39
+ import sys
40
+ import time
41
+ import urllib.error
42
+ import urllib.parse
43
+ import urllib.request
44
+
45
+ S2_BASE = "https://api.semanticscholar.org/graph/v1"
46
+ DEFAULT_FIELDS = "title,abstract,year,authors,venue,externalIds"
47
+ DEFAULT_LIMIT = 5
48
+ MAX_LIMIT = 100
49
+ _RETRY_SLEEP = 5 # seconds to wait after a 429 before retrying
50
+
51
+
52
def _build_headers() -> dict:
    """Return the request headers, adding ``x-api-key`` when a key is configured.

    The key is read from the ``SEMANTIC_SCHOLAR_API_KEY`` environment
    variable; an unset or whitespace-only value means no key header is sent.
    """
    key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
    if not key:
        return {"Accept": "application/json"}
    return {"Accept": "application/json", "x-api-key": key}
58
+
59
+
60
def search(query: str, limit: int, fields: str, retries: int = 3) -> dict:
    """
    Call /paper/search and return the parsed JSON response.

    Args:
        query: Title or free-text query.
        limit: Value sent as the ``limit`` query parameter.
        fields: Comma-separated field list sent as the ``fields`` parameter.
        retries: Maximum number of attempts. HTTP 429 and 500/502/503/504
            are retried after a sleep; everything else fails immediately.

    Returns:
        The decoded JSON body. HTTP 404 is mapped to
        ``{"total": 0, "data": []}``.

    Raises:
        SystemExit: With status 1 on any unrecoverable error (retries
            exhausted, other HTTP errors, network failures, unparseable
            body), after printing a message to stderr, so the caller (or
            CLI) gets a clean non-zero exit code.
    """
    params = urllib.parse.urlencode(
        {
            "query": query,
            "limit": limit,
            "fields": fields,
        }
    )
    url = f"{S2_BASE}/paper/search?{params}"
    headers = _build_headers()

    for attempt in range(1, retries + 1):
        req = urllib.request.Request(url, headers=headers, method="GET")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                return json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            if exc.code == 429:
                if attempt < retries:
                    print(
                        f"WARN: S2 rate-limited (429). Sleeping {_RETRY_SLEEP}s "
                        f"before retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(_RETRY_SLEEP)
                    continue
                print(
                    "ERROR: S2 rate-limited (429) and retries exhausted.\n"
                    "Tip: set SEMANTIC_SCHOLAR_API_KEY to get a higher rate limit.\n"
                    " See https://api.semanticscholar.org/ for a free key.",
                    file=sys.stderr,
                )
                sys.exit(1)
            if exc.code == 404:
                # not found — return an empty result set (caller handles this)
                return {"total": 0, "data": []}
            # 504 (gateway timeout) is as transient as the other 5xx codes.
            if exc.code in (500, 502, 503, 504):
                if attempt < retries:
                    print(
                        f"WARN: S2 server error ({exc.code}). Sleeping 30s before "
                        f"retry {attempt + 1}/{retries}.",
                        file=sys.stderr,
                    )
                    time.sleep(30)
                    continue
                print(
                    f"ERROR: S2 server error ({exc.code}) after {retries} attempts.",
                    file=sys.stderr,
                )
                sys.exit(1)
            body = exc.read().decode("utf-8", errors="replace")[:400]
            print(f"ERROR: S2 HTTP {exc.code}: {body}", file=sys.stderr)
            sys.exit(1)
        except urllib.error.URLError as exc:
            print(f"ERROR: Network error reaching Semantic Scholar: {exc.reason}", file=sys.stderr)
            sys.exit(1)
        except OSError as exc:
            # Timeouts and connection resets raised while reading the body
            # are plain OSErrors, not URLErrors.
            print(f"ERROR: Network error reaching Semantic Scholar: {exc}", file=sys.stderr)
            sys.exit(1)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError: the body was not the UTF-8 JSON we expect.
            print(f"ERROR: S2 returned an unparseable response: {exc}", file=sys.stderr)
            sys.exit(1)

    # Only reachable when retries < 1, i.e. no attempt was made.
    print(f"ERROR: no request attempted (retries={retries}).", file=sys.stderr)
    sys.exit(1)
125
+
126
+
127
def main() -> int:
    """CLI entry point: look a title up on Semantic Scholar and print JSON.

    ``--check-key`` only reports whether ``SEMANTIC_SCHOLAR_API_KEY`` is set
    and needs no ``--query``. Otherwise ``--query`` is mandatory; ``--limit``
    is clamped to ``[1, MAX_LIMIT]``.

    Returns:
        0 when at least one result was returned (or for ``--check-key``),
        1 when the search produced zero results, in both normal and
        ``--raw`` output modes. A missing ``--query`` ends the process with
        argparse's usage error (status 2).
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "--query",
        # Not `required=True`: that would make `--check-key` unusable on
        # its own. The requirement is enforced below instead.
        help="Paper title (or search query) to look up on Semantic Scholar",
    )
    p.add_argument(
        "--limit",
        type=int,
        default=DEFAULT_LIMIT,
        help=f"Max hits to return (default {DEFAULT_LIMIT}, max {MAX_LIMIT})",
    )
    p.add_argument(
        "--fields",
        default=DEFAULT_FIELDS,
        help=f"Comma-separated S2 fields to request (default: {DEFAULT_FIELDS})",
    )
    p.add_argument(
        "--raw",
        action="store_true",
        help="Print the full S2 JSON response unmodified instead of normalized output",
    )
    p.add_argument(
        "--check-key",
        action="store_true",
        help="Print whether SEMANTIC_SCHOLAR_API_KEY is set and exit (no network call)",
    )
    args = p.parse_args()

    if args.check_key:
        key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()
        if key:
            # Show only the ends of the key so it can be recognized without
            # being leaked into logs.
            masked = key[:4] + "..." + key[-4:] if len(key) > 8 else "****"
            print(
                f"SEMANTIC_SCHOLAR_API_KEY is set ({masked}). "
                "Authenticated mode: higher rate limits."
            )
        else:
            print(
                "SEMANTIC_SCHOLAR_API_KEY is NOT set. "
                "Unauthenticated mode: ~100 req/5 min, keep to ≤1 QPS.\n"
                "To enable higher rate limits:\n"
                " 1. Get a free key at https://api.semanticscholar.org/\n"
                ' 2. export SEMANTIC_SCHOLAR_API_KEY="your-key-here"'
            )
        return 0

    if not args.query:
        # p.error() prints the usage text and exits with status 2.
        p.error("--query is required unless --check-key is given")

    limit = max(1, min(MAX_LIMIT, args.limit))
    response = search(args.query, limit, args.fields)
    data = response.get("data") or []

    if args.raw:
        json.dump(response, sys.stdout, indent=2, ensure_ascii=False)
        sys.stdout.write("\n")
        # Same exit-code contract as the normalized path: 1 on zero results.
        return 0 if data else 1

    if not data:
        print(
            f"WARN: Semantic Scholar returned 0 results for query: {args.query!r}",
            file=sys.stderr,
        )
        json.dump({"total": 0, "data": []}, sys.stdout, indent=2)
        sys.stdout.write("\n")
        return 1

    # Emit normalized output (subset of fields used by pipeline)
    out = {
        "total": response.get("total", len(data)),
        "authenticated": bool(os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "").strip()),
        "data": data,
    }
    json.dump(out, sys.stdout, indent=2, ensure_ascii=False)
    sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/literature-review-agent/scripts/sync_keys.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ r"""
3
+ sync_keys.py — Synchronize citation keys in a .tex file with the canonical
4
+ bibtex_key values stored in citation_pool.json.
5
+
6
+ Problem: The Literature Review Agent writes cite keys in its own format
7
+ (e.g. 'lewis2020rag'), while bibtex_format.py generates canonical keys from
8
+ author + year + first-significant-title-word (e.g. 'lewis2020retrievalaugmented').
9
+ After running bibtex_format.py these two sources are out of sync, causing the
10
+ citation_coverage gate to fail (it looks for \cite{canonical_key} in the .tex).
11
+
12
+ This script reads the 'key' -> 'bibtex_key' mapping from citation_pool.json
13
+ and performs a targeted substitution inside \cite{}, \citep{}, \citet{}
14
+ commands in the target .tex file. It handles multi-key citations like
15
+ \cite{a,b,c} correctly.
16
+
17
+ Run this immediately after bibtex_format.py, before Step 4 (Section Writing).
18
+
19
+ Usage:
20
+ python sync_keys.py \
21
+ --pool workspace/citation_pool.json \
22
+ --tex workspace/drafts/intro_relwork.tex \
23
+ --inplace
24
+
25
+ # Without --inplace: prints updated content to stdout (safe preview mode).
26
+ """
27
+ import argparse
28
+ import json
29
+ import re
30
+ import sys
31
+
32
+ # Matches \cite, \citep, \citet, \citealt, \citealp, \citeauthor, \citeyear,
33
+ # starred variants like \cite*, and the optional [prenote][postnote] args.
34
+ CITE_RE = re.compile(
35
+ r"(\\cite[a-zA-Z*]*)" # command
36
+ r"(?:\[[^\]]*\])*" # optional bracket args (prenote/postnote)
37
+ r"\{([^}]+)\}" # required brace arg with keys
38
+ )
39
+
40
+
41
def build_key_map(pool: dict) -> dict[str, str]:
    """Map each paper's ``key`` to its ``bibtex_key`` where the two differ.

    Papers lacking either value, or whose two keys are already equal, are
    left out. If the same ``key`` occurs more than once, the last paper wins.
    """
    pairs = (
        (paper.get("key"), paper.get("bibtex_key"))
        for paper in pool.get("papers", [])
    )
    return {old: new for old, new in pairs if old and new and old != new}
50
+
51
+
52
def replace_keys(content: str, key_map: dict[str, str]) -> tuple[str, int]:
    r"""Rewrite citation keys inside every command matched by ``CITE_RE``.

    Each comma-separated key in the brace argument is looked up in
    *key_map* and replaced when present. The command name and any optional
    bracket arguments are reproduced verbatim. Citations that contain no
    mapped key are returned untouched, so unrelated text is not reformatted.

    Args:
        content: LaTeX source text.
        key_map: ``{old_key: new_key}`` substitutions.

    Returns:
        ``(updated_content, n_replaced)`` where *n_replaced* counts the
        individual keys that were substituted.
    """
    if not key_map:
        return content, 0

    n_replaced = 0

    def replacer(m: re.Match) -> str:
        nonlocal n_replaced
        keys = [k.strip() for k in m.group(2).split(",")]
        hits = sum(1 for k in keys if k in key_map)
        if not hits:
            return m.group(0)
        n_replaced += hits
        new_keys = [key_map.get(k, k) for k in keys]
        # Everything before the brace that opens the key list. Located via
        # the capture-group offset rather than the first "{" in the match,
        # because an optional argument such as [\S{}2] may contain braces.
        head = m.group(0)[: m.start(2) - m.start(0) - 1]
        return f"{head}{{{', '.join(new_keys)}}}"

    updated = CITE_RE.sub(replacer, content)
    return updated, n_replaced
79
+
80
+
81
def main() -> int:
    """Apply the pool's ``key`` -> ``bibtex_key`` mapping to a .tex file.

    With ``--inplace`` the file is overwritten and progress is reported on
    stdout. Without it (preview mode) stdout carries *only* the resulting
    LaTeX, so it can be redirected to a file; all informational messages go
    to stderr.

    Returns:
        Always 0.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the layout of the usage example in the module docstring.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json")
    p.add_argument("--tex", required=True, help="Target .tex file to update")
    p.add_argument(
        "--inplace", action="store_true", help="Overwrite --tex in place (default: print to stdout)"
    )
    args = p.parse_args()

    # In preview mode stdout is the document itself; keep diagnostics off it.
    info = sys.stdout if args.inplace else sys.stderr

    with open(args.pool) as f:
        pool = json.load(f)
    key_map = build_key_map(pool)

    if not key_map:
        print("OK: no key differences in citation_pool.json — nothing to sync", file=info)
        if not args.inplace:
            # The preview of "no changes" is the unchanged file.
            with open(args.tex) as f:
                sys.stdout.write(f.read())
        return 0

    print(f"Key map ({len(key_map)} substitutions):", file=info)
    for old, new in key_map.items():
        print(f" {old} → {new}", file=info)

    with open(args.tex) as f:
        content = f.read()

    updated, n = replace_keys(content, key_map)

    if args.inplace:
        with open(args.tex, "w") as f:
            f.write(updated)
        print(f"OK: {n} citation key(s) updated in {args.tex}")
    else:
        sys.stdout.write(updated)
        print(f"\n# sync_keys: {n} substitution(s) would be made", file=sys.stderr)

    return 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/literature-review-agent/scripts/validate_pool.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ validate_pool.py — Validate and auto-fix citation_pool.json before it is
4
+ passed to bibtex_format.py or the Section Writing Agent.
5
+
6
+ Catches the two most common schema errors produced by the Literature Review
7
+ Agent and fixes them in place with --fix.
8
+
9
+ Error 1 — Authors as plain strings (WRONG format for bibtex_format.py):
10
+ WRONG: "authors": ["Alice Smith", "Bob Jones"]
11
+ CORRECT: "authors": [{"name": "Alice Smith"}, {"name": "Bob Jones"}]
12
+
13
+ Error 2 — Missing required fields (title, year). These cause bibtex_format.py
14
+ to emit incomplete entries. Reported as errors, not auto-fixed.
15
+
16
+ Also checks that the pool has the top-level keys that downstream scripts
17
+ expect: "papers", "min_cite_paper_count".
18
+
19
+ Exit codes:
20
+ 0 Pool is valid (or was fully fixed with --fix)
21
+ 1 Unrecoverable errors remain (missing required fields, no papers)
22
+
23
+ Usage:
24
+ python validate_pool.py --pool workspace/citation_pool.json
25
+ python validate_pool.py --pool workspace/citation_pool.json --fix
26
+ """
27
+ import argparse
28
+ import json
29
+ import sys
30
+
31
REQUIRED_PAPER_FIELDS = ["title", "year"]
RECOMMENDED_PAPER_FIELDS = ["paperId", "abstract", "venue", "authors"]
REQUIRED_TOP_FIELDS = ["papers", "min_cite_paper_count"]


def validate_and_fix(pool: dict, fix: bool) -> tuple[list[str], list[str], int]:
    """
    Validate *pool* and optionally repair the authors format.

    Checks performed:
      * missing top-level fields (warning);
      * ``papers`` missing, empty or not a list (error, stops validation);
      * paper entries that are not objects (error);
      * ``authors`` that is not a list, or contains items that are neither
        strings nor dicts (error);
      * authors given as plain strings: converted to ``{"name": ...}`` when
        *fix* is true, otherwise reported as an error. Every element is
        inspected, and existing dict entries are left as they are;
      * missing required fields (error) and recommended fields (warning).

    Returns (errors, warnings, n_fixed), where n_fixed counts the papers
    whose authors were converted.
    If fix=True, mutates pool in place where possible.
    """
    errors: list[str] = []
    warnings: list[str] = []
    n_fixed = 0

    # Top-level structure
    for field in REQUIRED_TOP_FIELDS:
        if field not in pool:
            warnings.append(f"top-level field '{field}' missing — was dedupe_by_id.py run?")

    papers = pool.get("papers", [])
    if not papers:
        errors.append("pool['papers'] is empty or missing")
        return errors, warnings, n_fixed
    if not isinstance(papers, list):
        errors.append(f"pool['papers'] must be a list, got {type(papers).__name__}")
        return errors, warnings, n_fixed

    for i, paper in enumerate(papers):
        if not isinstance(paper, dict):
            errors.append(f"[paper #{i}] entry must be an object, got {type(paper).__name__}")
            continue
        label = paper.get("title") or f"paper #{i}"

        # --- Authors format check ---
        authors = paper.get("authors")
        if authors is not None:
            if not isinstance(authors, list):
                errors.append(f"[{label}] 'authors' must be a list, got {type(authors).__name__}")
            else:
                plain = [a for a in authors if isinstance(a, str)]
                if plain:
                    if fix:
                        # Convert only the string entries; wrapping entries
                        # that are already dicts would nest them.
                        paper["authors"] = [
                            {"name": a} if isinstance(a, str) else a for a in authors
                        ]
                        n_fixed += 1
                    else:
                        errors.append(
                            f"[{label}] authors are plain strings "
                            f'(e.g. "{plain[0]}") — run with --fix to auto-convert'
                        )
                # Items that are neither str nor dict cannot be repaired.
                for idx, a in enumerate(authors):
                    if not isinstance(a, (str, dict)):
                        errors.append(
                            f"[{label}] authors[{idx}] is {type(a).__name__}, "
                            f"expected dict with 'name' key"
                        )
                        break

        # --- Required fields ---
        for field in REQUIRED_PAPER_FIELDS:
            if not paper.get(field):
                errors.append(f"[{label}] missing required field '{field}'")

        # --- Recommended fields ---
        for field in RECOMMENDED_PAPER_FIELDS:
            if not paper.get(field):
                warnings.append(f"[{label}] missing recommended field '{field}'")

    return errors, warnings, n_fixed
90
+
91
+
92
def main() -> int:
    """Validate the pool file, optionally write fixes back, and report.

    Warnings go to stdout (unless ``--quiet``), errors to stderr. With
    ``--fix`` any repaired pool is written back to ``--pool`` even when
    other errors remain.

    Returns:
        0 when no errors remain, 1 otherwise. In ``--fix`` mode recoverable
        problems are repaired rather than reported, so every error still
        listed is unrecoverable and yields 1.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Preserve the layout of the examples in the module docstring.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pool", required=True, help="citation_pool.json path")
    p.add_argument(
        "--fix",
        action="store_true",
        help="Auto-fix recoverable errors (authors format) and write back",
    )
    p.add_argument("--quiet", action="store_true", help="Suppress warnings, only show errors")
    args = p.parse_args()

    with open(args.pool) as f:
        pool = json.load(f)

    errors, warnings, n_fixed = validate_and_fix(pool, fix=args.fix)

    if not args.quiet:
        for w in warnings:
            print(f"WARN: {w}")

    for e in errors:
        print(f"ERROR: {e}", file=sys.stderr)

    if errors and not args.fix:
        print(
            "\nTip: re-run with --fix to auto-correct recoverable issues (authors format).",
            file=sys.stderr,
        )
        return 1

    if n_fixed > 0:
        with open(args.pool, "w") as f:
            json.dump(pool, f, indent=2, ensure_ascii=False)
        print(f"OK: {n_fixed} paper(s) auto-fixed and written back to {args.pool}")

    if not errors:
        n = len(pool.get("papers", []))
        if n_fixed == 0:
            print(f"OK: {n} papers validated — no errors")
        else:
            print(f"OK: {n} papers validated after auto-fix")

    # Decide on the error list itself, not on substrings of the messages:
    # a partial auto-fix must not mask errors that are still outstanding.
    return 1 if errors else 0


if __name__ == "__main__":
    sys.exit(main())
.scider/skills/matplotlib/SKILL.md ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: matplotlib
3
+ description: Low-level plotting library for full customization. Use when you need fine-grained control over every plot element, novel plot types, or publication-quality PNG/PDF/SVG export. For quick statistical plots use seaborn.
4
+ allowed_agents: [experiment, native_coding]
5
+ ---
6
+
7
+ # Matplotlib
8
+
9
+ ## Overview
10
+
11
+ Matplotlib is Python's foundational visualization library for creating static, animated, and interactive plots. This skill provides guidance on using matplotlib effectively, covering both the pyplot interface (MATLAB-style) and the object-oriented API (Figure/Axes), along with best practices for creating publication-quality visualizations.
12
+
13
+ ## When to Use This Skill
14
+
15
+ This skill should be used when:
16
+ - Creating any type of plot or chart (line, scatter, bar, histogram, heatmap, contour, etc.)
17
+ - Generating scientific or statistical visualizations
18
+ - Customizing plot appearance (colors, styles, labels, legends)
19
+ - Creating multi-panel figures with subplots
20
+ - Exporting visualizations to various formats (PNG, PDF, SVG, etc.)
21
+ - Building interactive plots or animations
22
+ - Working with 3D visualizations
23
+ - Integrating plots into Jupyter notebooks or GUI applications
24
+
25
+ ## Core Concepts
26
+
27
+ ### The Matplotlib Hierarchy
28
+
29
+ Matplotlib uses a hierarchical structure of objects:
30
+
31
+ 1. **Figure** - The top-level container for all plot elements
32
+ 2. **Axes** - The actual plotting area where data is displayed (one Figure can contain multiple Axes)
33
+ 3. **Artist** - Everything visible on the figure (lines, text, ticks, etc.)
34
+ 4. **Axis** - The number line objects (x-axis, y-axis) that handle ticks and labels
35
+
36
+ ### Two Interfaces
37
+
38
+ **1. pyplot Interface (Implicit, MATLAB-style)**
39
+ ```python
40
+ import matplotlib.pyplot as plt
41
+
42
+ plt.plot([1, 2, 3, 4])
43
+ plt.ylabel('some numbers')
44
+ plt.show()
45
+ ```
46
+ - Convenient for quick, simple plots
47
+ - Maintains state automatically
48
+ - Good for interactive work and simple scripts
49
+
50
+ **2. Object-Oriented Interface (Explicit)**
51
+ ```python
52
+ import matplotlib.pyplot as plt
53
+
54
+ fig, ax = plt.subplots()
55
+ ax.plot([1, 2, 3, 4])
56
+ ax.set_ylabel('some numbers')
57
+ plt.show()
58
+ ```
59
+ - **Recommended for most use cases**
60
+ - More explicit control over figure and axes
61
+ - Better for complex figures with multiple subplots
62
+ - Easier to maintain and debug
63
+
64
+ ## Common Workflows
65
+
66
+ ### 1. Basic Plot Creation
67
+
68
+ **Single plot workflow:**
69
+ ```python
70
+ import matplotlib.pyplot as plt
71
+ import numpy as np
72
+
73
+ # Create figure and axes (OO interface - RECOMMENDED)
74
+ fig, ax = plt.subplots(figsize=(10, 6))
75
+
76
+ # Generate and plot data
77
+ x = np.linspace(0, 2*np.pi, 100)
78
+ ax.plot(x, np.sin(x), label='sin(x)')
79
+ ax.plot(x, np.cos(x), label='cos(x)')
80
+
81
+ # Customize
82
+ ax.set_xlabel('x')
83
+ ax.set_ylabel('y')
84
+ ax.set_title('Trigonometric Functions')
85
+ ax.legend()
86
+ ax.grid(True, alpha=0.3)
87
+
88
+ # Save and/or display
89
+ plt.savefig('plot.png', dpi=300, bbox_inches='tight')
90
+ plt.show()
91
+ ```
92
+
93
+ ### 2. Multiple Subplots
94
+
95
+ **Creating subplot layouts:**
96
+ ```python
97
+ # Method 1: Regular grid
98
+ fig, axes = plt.subplots(2, 2, figsize=(12, 10))
99
+ axes[0, 0].plot(x, y1)
100
+ axes[0, 1].scatter(x, y2)
101
+ axes[1, 0].bar(categories, values)
102
+ axes[1, 1].hist(data, bins=30)
103
+
104
+ # Method 2: Mosaic layout (more flexible)
105
+ fig, axes = plt.subplot_mosaic([['left', 'right_top'],
106
+ ['left', 'right_bottom']],
107
+ figsize=(10, 8))
108
+ axes['left'].plot(x, y)
109
+ axes['right_top'].scatter(x, y)
110
+ axes['right_bottom'].hist(data)
111
+
112
+ # Method 3: GridSpec (maximum control)
113
+ from matplotlib.gridspec import GridSpec
114
+ fig = plt.figure(figsize=(12, 8))
115
+ gs = GridSpec(3, 3, figure=fig)
116
+ ax1 = fig.add_subplot(gs[0, :]) # Top row, all columns
117
+ ax2 = fig.add_subplot(gs[1:, 0]) # Bottom two rows, first column
118
+ ax3 = fig.add_subplot(gs[1:, 1:]) # Bottom two rows, last two columns
119
+ ```
120
+
121
+ ### 3. Plot Types and Use Cases
122
+
123
+ **Line plots** - Time series, continuous data, trends
124
+ ```python
125
+ ax.plot(x, y, linewidth=2, linestyle='--', marker='o', color='blue')
126
+ ```
127
+
128
+ **Scatter plots** - Relationships between variables, correlations
129
+ ```python
130
+ ax.scatter(x, y, s=sizes, c=colors, alpha=0.6, cmap='viridis')
131
+ ```
132
+
133
+ **Bar charts** - Categorical comparisons
134
+ ```python
135
+ ax.bar(categories, values, color='steelblue', edgecolor='black')
136
+ # For horizontal bars:
137
+ ax.barh(categories, values)
138
+ ```
139
+
140
+ **Histograms** - Distributions
141
+ ```python
142
+ ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
143
+ ```
144
+
145
+ **Heatmaps** - Matrix data, correlations
146
+ ```python
147
+ im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
148
+ plt.colorbar(im, ax=ax)
149
+ ```
150
+
151
+ **Contour plots** - 3D data on 2D plane
152
+ ```python
153
+ contour = ax.contour(X, Y, Z, levels=10)
154
+ ax.clabel(contour, inline=True, fontsize=8)
155
+ ```
156
+
157
+ **Box plots** - Statistical distributions
158
+ ```python
159
+ ax.boxplot([data1, data2, data3], labels=['A', 'B', 'C'])
160
+ ```
161
+
162
+ **Violin plots** - Distribution densities
163
+ ```python
164
+ ax.violinplot([data1, data2, data3], positions=[1, 2, 3])
165
+ ```
166
+
167
+ For comprehensive plot type examples and variations, refer to `references/plot_types.md`.
168
+
169
+ ### 4. Styling and Customization
170
+
171
+ **Color specification methods:**
172
+ - Named colors: `'red'`, `'blue'`, `'steelblue'`
173
+ - Hex codes: `'#FF5733'`
174
+ - RGB tuples: `(0.1, 0.2, 0.3)`
175
+ - Colormaps: `cmap='viridis'`, `cmap='plasma'`, `cmap='coolwarm'`
176
+
177
+ **Using style sheets:**
178
+ ```python
179
+ plt.style.use('seaborn-v0_8-darkgrid') # Apply predefined style
180
+ # Available styles: 'ggplot', 'bmh', 'fivethirtyeight', etc.
181
+ print(plt.style.available) # List all available styles
182
+ ```
183
+
184
+ **Customizing with rcParams:**
185
+ ```python
186
+ plt.rcParams['font.size'] = 12
187
+ plt.rcParams['axes.labelsize'] = 14
188
+ plt.rcParams['axes.titlesize'] = 16
189
+ plt.rcParams['xtick.labelsize'] = 10
190
+ plt.rcParams['ytick.labelsize'] = 10
191
+ plt.rcParams['legend.fontsize'] = 12
192
+ plt.rcParams['figure.titlesize'] = 18
193
+ ```
194
+
195
+ **Text and annotations:**
196
+ ```python
197
+ ax.text(x, y, 'annotation', fontsize=12, ha='center')
198
+ ax.annotate('important point', xy=(x, y), xytext=(x+1, y+1),
199
+ arrowprops=dict(arrowstyle='->', color='red'))
200
+ ```
201
+
202
+ For detailed styling options and colormap guidelines, see `references/styling_guide.md`.
203
+
204
+ ### 5. Saving Figures
205
+
206
+ **Export to various formats:**
207
+ ```python
208
+ # High-resolution PNG for presentations/papers
209
+ plt.savefig('figure.png', dpi=300, bbox_inches='tight', facecolor='white')
210
+
211
+ # Vector format for publications (scalable)
212
+ plt.savefig('figure.pdf', bbox_inches='tight')
213
+ plt.savefig('figure.svg', bbox_inches='tight')
214
+
215
+ # Transparent background
216
+ plt.savefig('figure.png', dpi=300, bbox_inches='tight', transparent=True)
217
+ ```
218
+
219
+ **Important parameters:**
220
+ - `dpi`: Resolution (300 for publications, 150 for web, 72 for screen)
221
+ - `bbox_inches='tight'`: Removes excess whitespace
222
+ - `facecolor='white'`: Ensures white background (useful for transparent themes)
223
+ - `transparent=True`: Transparent background
224
+
225
+ ### 6. Working with 3D Plots
226
+
227
+ ```python
228
+ from mpl_toolkits.mplot3d import Axes3D
229
+
230
+ fig = plt.figure(figsize=(10, 8))
231
+ ax = fig.add_subplot(111, projection='3d')
232
+
233
+ # Surface plot
234
+ ax.plot_surface(X, Y, Z, cmap='viridis')
235
+
236
+ # 3D scatter
237
+ ax.scatter(x, y, z, c=colors, marker='o')
238
+
239
+ # 3D line plot
240
+ ax.plot(x, y, z, linewidth=2)
241
+
242
+ # Labels
243
+ ax.set_xlabel('X Label')
244
+ ax.set_ylabel('Y Label')
245
+ ax.set_zlabel('Z Label')
246
+ ```
247
+
248
+ ## Best Practices
249
+
250
+ ### 1. Interface Selection
251
+ - **Use the object-oriented interface** (fig, ax = plt.subplots()) for production code
252
+ - Reserve pyplot interface for quick interactive exploration only
253
+ - Always create figures explicitly rather than relying on implicit state
254
+
255
+ ### 2. Figure Size and DPI
256
+ - Set figsize at creation: `fig, ax = plt.subplots(figsize=(10, 6))`
257
+ - Use appropriate DPI for output medium:
258
+ - Screen/notebook: 72-100 dpi
259
+ - Web: 150 dpi
260
+ - Print/publications: 300 dpi
261
+
262
+ ### 3. Layout Management
263
+ - Use `constrained_layout=True` or `tight_layout()` to prevent overlapping elements
264
+ - `fig, ax = plt.subplots(constrained_layout=True)` is recommended for automatic spacing
265
+
266
+ ### 4. Colormap Selection
267
+ - **Sequential** (viridis, plasma, inferno): Ordered data with consistent progression
268
+ - **Diverging** (coolwarm, RdBu): Data with meaningful center point (e.g., zero)
269
+ - **Qualitative** (tab10, Set3): Categorical/nominal data
270
+ - Avoid rainbow colormaps (jet) - they are not perceptually uniform
271
+
272
+ ### 5. Accessibility
273
+ - Use colorblind-friendly colormaps (viridis, cividis)
274
+ - Add patterns/hatching for bar charts in addition to colors
275
+ - Ensure sufficient contrast between elements
276
+ - Include descriptive labels and legends
277
+
278
+ ### 6. Performance
279
+ - For large datasets, use `rasterized=True` in plot calls to reduce file size
280
+ - Use appropriate data reduction before plotting (e.g., downsample dense time series)
281
+ - For animations, use blitting for better performance
282
+
283
+ ### 7. Code Organization
284
+ ```python
285
+ # Good practice: Clear structure
286
+ def create_analysis_plot(data, title):
287
+ """Create standardized analysis plot."""
288
+ fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)
289
+
290
+ # Plot data
291
+ ax.plot(data['x'], data['y'], linewidth=2)
292
+
293
+ # Customize
294
+ ax.set_xlabel('X Axis Label', fontsize=12)
295
+ ax.set_ylabel('Y Axis Label', fontsize=12)
296
+ ax.set_title(title, fontsize=14, fontweight='bold')
297
+ ax.grid(True, alpha=0.3)
298
+
299
+ return fig, ax
300
+
301
+ # Use the function
302
+ fig, ax = create_analysis_plot(my_data, 'My Analysis')
303
+ plt.savefig('analysis.png', dpi=300, bbox_inches='tight')
304
+ ```
305
+
306
+ ## Quick Reference Scripts
307
+
308
+ This skill includes helper scripts in the `scripts/` directory:
309
+
310
+ ### `plot_template.py`
311
+ Template script demonstrating various plot types with best practices. Use this as a starting point for creating new visualizations.
312
+
313
+ **Usage:**
314
+ ```bash
315
+ python scripts/plot_template.py
316
+ ```
317
+
318
+ ### `style_configurator.py`
319
+ Interactive utility to configure matplotlib style preferences and generate custom style sheets.
320
+
321
+ **Usage:**
322
+ ```bash
323
+ python scripts/style_configurator.py
324
+ ```
325
+
326
+ ## Detailed References
327
+
328
+ For comprehensive information, consult the reference documents:
329
+
330
+ - **`references/plot_types.md`** - Complete catalog of plot types with code examples and use cases
331
+ - **`references/styling_guide.md`** - Detailed styling options, colormaps, and customization
332
+ - **`references/api_reference.md`** - Core classes and methods reference
333
+ - **`references/common_issues.md`** - Troubleshooting guide for common problems
334
+
335
+ ## Integration with Other Tools
336
+
337
+ Matplotlib integrates well with:
338
+ - **NumPy/Pandas** - Direct plotting from arrays and DataFrames
339
+ - **Seaborn** - High-level statistical visualizations built on matplotlib
340
+ - **Jupyter** - Interactive plotting with `%matplotlib inline` or `%matplotlib widget`
341
+ - **GUI frameworks** - Embedding in Tkinter, Qt, wxPython applications
342
+
343
+ ## Common Gotchas
344
+
345
+ 1. **Overlapping elements**: Use `constrained_layout=True` or `tight_layout()`
346
+ 2. **State confusion**: Use OO interface to avoid pyplot state machine issues
347
+ 3. **Memory issues with many figures**: Close figures explicitly with `plt.close(fig)`
348
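+ 4. **Font warnings**: Install the missing font, or set `plt.rcParams['font.sans-serif']` to a list of fonts that are actually installed (this selects a fallback font; it does not by itself suppress warnings)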
349
+ 5. **DPI confusion**: Remember that figsize is in inches, not pixels: `pixels = dpi * inches`
350
+
351
+ ## Additional Resources
352
+
353
+ - Official documentation: https://matplotlib.org/
354
+ - Gallery: https://matplotlib.org/stable/gallery/index.html
355
+ - Cheatsheets: https://matplotlib.org/cheatsheets/
356
+ - Tutorials: https://matplotlib.org/stable/tutorials/index.html
.scider/skills/matplotlib/references/api_reference.md ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Matplotlib API Reference
2
+
3
+ This document provides a quick reference for the most commonly used matplotlib classes and methods.
4
+
5
+ ## Core Classes
6
+
7
+ ### Figure
8
+
9
+ The top-level container for all plot elements.
10
+
11
+ **Creation:**
12
+ ```python
13
+ fig = plt.figure(figsize=(10, 6), dpi=100, facecolor='white')
14
+ fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
15
+ fig, axes = plt.subplots(2, 2, figsize=(12, 10))
16
+ ```
17
+
18
+ **Key Methods:**
19
+ - `fig.add_subplot(nrows, ncols, index)` - Add a subplot
20
+ - `fig.add_axes([left, bottom, width, height])` - Add axes at specific position
21
+ - `fig.savefig(filename, dpi=300, bbox_inches='tight')` - Save figure
22
+ - `fig.tight_layout()` - Adjust spacing to prevent overlaps
23
+ - `fig.suptitle(title)` - Set figure title
24
+ - `fig.legend()` - Create figure-level legend
25
+ - `fig.colorbar(mappable)` - Add colorbar to figure
26
+ - `plt.close(fig)` - Close figure to free memory
27
+
28
+ **Key Attributes:**
29
+ - `fig.axes` - List of all axes in the figure
30
+ - `fig.dpi` - Resolution in dots per inch
31
+ - `fig.get_size_inches()` - Figure dimensions in inches (width, height); `figsize` is a constructor argument, not an attribute
32
+
33
+ ### Axes
34
+
35
+ The actual plotting area where data is visualized.
36
+
37
+ **Creation:**
38
+ ```python
39
+ fig, ax = plt.subplots() # Single axes
40
+ ax = fig.add_subplot(111) # Alternative method
41
+ ```
42
+
43
+ **Plotting Methods:**
44
+
45
+ **Line plots:**
46
+ - `ax.plot(x, y, **kwargs)` - Line plot
47
+ - `ax.step(x, y, where='pre'/'mid'/'post')` - Step plot
48
+ - `ax.errorbar(x, y, yerr, xerr)` - Error bars
49
+
50
+ **Scatter plots:**
51
+ - `ax.scatter(x, y, s=size, c=color, marker='o', alpha=0.5)` - Scatter plot
52
+
53
+ **Bar charts:**
54
+ - `ax.bar(x, height, width=0.8, align='center')` - Vertical bar chart
55
+ - `ax.barh(y, width)` - Horizontal bar chart
56
+
57
+ **Statistical plots:**
58
+ - `ax.hist(data, bins=10, density=False)` - Histogram
59
+ - `ax.boxplot(data, labels=None)` - Box plot
60
+ - `ax.violinplot(data)` - Violin plot
61
+
62
+ **2D plots:**
63
+ - `ax.imshow(array, cmap='viridis', aspect='auto')` - Display image/matrix
64
+ - `ax.contour(X, Y, Z, levels=10)` - Contour lines
65
+ - `ax.contourf(X, Y, Z, levels=10)` - Filled contours
66
+ - `ax.pcolormesh(X, Y, Z)` - Pseudocolor plot
67
+
68
+ **Filling:**
69
+ - `ax.fill_between(x, y1, y2, alpha=0.3)` - Fill between curves
70
+ - `ax.fill_betweenx(y, x1, x2)` - Fill between vertical curves
71
+
72
+ **Text and annotations:**
73
+ - `ax.text(x, y, text, fontsize=12)` - Add text
74
+ - `ax.annotate(text, xy=(x, y), xytext=(x2, y2), arrowprops={})` - Annotate with arrow
75
+
76
+ **Customization Methods:**
77
+
78
+ **Labels and titles:**
79
+ - `ax.set_xlabel(label, fontsize=12)` - Set x-axis label
80
+ - `ax.set_ylabel(label, fontsize=12)` - Set y-axis label
81
+ - `ax.set_title(title, fontsize=14)` - Set axes title
82
+
83
+ **Limits and scales:**
84
+ - `ax.set_xlim(left, right)` - Set x-axis limits
85
+ - `ax.set_ylim(bottom, top)` - Set y-axis limits
86
+ - `ax.set_xscale('linear'/'log'/'symlog')` - Set x-axis scale
87
+ - `ax.set_yscale('linear'/'log'/'symlog')` - Set y-axis scale
88
+
89
+ **Ticks:**
90
+ - `ax.set_xticks(positions)` - Set x-tick positions
91
+ - `ax.set_xticklabels(labels)` - Set x-tick labels
92
+ - `ax.tick_params(axis='both', labelsize=10)` - Customize tick appearance
93
+
94
+ **Grid and spines:**
95
+ - `ax.grid(True, alpha=0.3, linestyle='--')` - Add grid
96
+ - `ax.spines['top'].set_visible(False)` - Hide top spine
97
+ - `ax.spines['right'].set_visible(False)` - Hide right spine
98
+
99
+ **Legend:**
100
+ - `ax.legend(loc='best', fontsize=10, frameon=True)` - Add legend
101
+ - `ax.legend(handles, labels)` - Custom legend
102
+
103
+ **Aspect and layout:**
104
+ - `ax.set_aspect('equal'/'auto'/ratio)` - Set aspect ratio
105
+ - `ax.invert_xaxis()` - Invert x-axis
106
+ - `ax.invert_yaxis()` - Invert y-axis
107
+
108
+ ### pyplot Module
109
+
110
+ High-level interface for quick plotting.
111
+
112
+ **Figure creation:**
113
+ - `plt.figure()` - Create new figure
114
+ - `plt.subplots()` - Create figure and axes
115
+ - `plt.subplot()` - Add subplot to current figure
116
+
117
+ **Plotting (uses current axes):**
118
+ - `plt.plot()` - Line plot
119
+ - `plt.scatter()` - Scatter plot
120
+ - `plt.bar()` - Bar chart
121
+ - `plt.hist()` - Histogram
122
+ - (All axes methods available)
123
+
124
+ **Display and save:**
125
+ - `plt.show()` - Display figure
126
+ - `plt.savefig()` - Save figure
127
+ - `plt.close()` - Close figure
128
+
129
+ **Style:**
130
+ - `plt.style.use(style_name)` - Apply style sheet
131
+ - `plt.style.available` - List available styles
132
+
133
+ **State management:**
134
+ - `plt.gca()` - Get current axes
135
+ - `plt.gcf()` - Get current figure
136
+ - `plt.sca(ax)` - Set current axes
137
+ - `plt.clf()` - Clear current figure
138
+ - `plt.cla()` - Clear current axes
139
+
140
+ ## Line and Marker Styles
141
+
142
+ ### Line Styles
143
+ - `'-'` or `'solid'` - Solid line
144
+ - `'--'` or `'dashed'` - Dashed line
145
+ - `'-.'` or `'dashdot'` - Dash-dot line
146
+ - `':'` or `'dotted'` - Dotted line
147
+ - `''` or `' '` or `'None'` - No line
148
+
149
+ ### Marker Styles
150
+ - `'.'` - Point marker
151
+ - `'o'` - Circle marker
152
+ - `'v'`, `'^'`, `'<'`, `'>'` - Triangle markers
153
+ - `'s'` - Square marker
154
+ - `'p'` - Pentagon marker
155
+ - `'*'` - Star marker
156
+ - `'h'`, `'H'` - Hexagon markers
157
+ - `'+'` - Plus marker
158
+ - `'x'` - X marker
159
+ - `'D'`, `'d'` - Diamond markers
160
+
161
+ ### Color Specifications
162
+
163
+ **Single character shortcuts:**
164
+ - `'b'` - Blue
165
+ - `'g'` - Green
166
+ - `'r'` - Red
167
+ - `'c'` - Cyan
168
+ - `'m'` - Magenta
169
+ - `'y'` - Yellow
170
+ - `'k'` - Black
171
+ - `'w'` - White
172
+
173
+ **Named colors:**
174
+ - `'steelblue'`, `'coral'`, `'teal'`, etc.
175
+ - See full list: https://matplotlib.org/stable/gallery/color/named_colors.html
176
+
177
+ **Other formats:**
178
+ - Hex: `'#FF5733'`
179
+ - RGB tuple: `(0.1, 0.2, 0.3)`
180
+ - RGBA tuple: `(0.1, 0.2, 0.3, 0.5)`
181
+
182
+ ## Common Parameters
183
+
184
+ ### Plot Function Parameters
185
+
186
+ ```python
187
+ ax.plot(x, y,
188
+ color='blue', # Line color
189
+ linewidth=2, # Line width
190
+ linestyle='--', # Line style
191
+ marker='o', # Marker style
192
+ markersize=8, # Marker size
193
+ markerfacecolor='red', # Marker fill color
194
+ markeredgecolor='black',# Marker edge color
195
+ markeredgewidth=1, # Marker edge width
196
+ alpha=0.7, # Transparency (0-1)
197
+ label='data', # Legend label
198
+ zorder=2, # Drawing order
199
+ rasterized=True # Rasterize for smaller file size
200
+ )
201
+ ```
202
+
203
+ ### Scatter Function Parameters
204
+
205
+ ```python
206
+ ax.scatter(x, y,
207
+ s=50, # Size (scalar or array)
208
+ c='blue', # Color (scalar, array, or sequence)
209
+ marker='o', # Marker style
210
+ cmap='viridis', # Colormap (if c is numeric)
211
+ alpha=0.5, # Transparency
212
+ edgecolors='black', # Edge color
213
+ linewidths=1, # Edge width
214
+ vmin=0, vmax=1, # Color scale limits
215
+ label='data' # Legend label
216
+ )
217
+ ```
218
+
219
+ ### Text Parameters
220
+
221
+ ```python
222
+ ax.text(x, y, text,
223
+ fontsize=12, # Font size
224
+ fontweight='normal', # 'normal', 'bold', 'heavy', 'light'
225
+ fontstyle='normal', # 'normal', 'italic', 'oblique'
226
+ fontfamily='sans-serif',# Font family
227
+ color='black', # Text color
228
+ alpha=1.0, # Transparency
229
+ ha='center', # Horizontal alignment: 'left', 'center', 'right'
230
+ va='center', # Vertical alignment: 'top', 'center', 'bottom', 'baseline'
231
+ rotation=0, # Rotation angle in degrees
232
+ bbox=dict( # Background box
233
+ facecolor='white',
234
+ edgecolor='black',
235
+ boxstyle='round'
236
+ )
237
+ )
238
+ ```
239
+
240
+ ## rcParams Configuration
241
+
242
+ Common rcParams settings for global customization:
243
+
244
+ ```python
245
+ # Font settings
246
+ plt.rcParams['font.family'] = 'sans-serif'
247
+ plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
248
+ plt.rcParams['font.size'] = 12
249
+
250
+ # Figure settings
251
+ plt.rcParams['figure.figsize'] = (10, 6)
252
+ plt.rcParams['figure.dpi'] = 100
253
+ plt.rcParams['figure.facecolor'] = 'white'
254
+ plt.rcParams['savefig.dpi'] = 300
255
+ plt.rcParams['savefig.bbox'] = 'tight'
256
+
257
+ # Axes settings
258
+ plt.rcParams['axes.labelsize'] = 14
259
+ plt.rcParams['axes.titlesize'] = 16
260
+ plt.rcParams['axes.grid'] = True
261
+ plt.rcParams['grid.alpha'] = 0.3  # grid transparency lives under 'grid.*'; 'axes.grid.alpha' is not a valid rcParam
262
+
263
+ # Line settings
264
+ plt.rcParams['lines.linewidth'] = 2
265
+ plt.rcParams['lines.markersize'] = 8
266
+
267
+ # Tick settings
268
+ plt.rcParams['xtick.labelsize'] = 10
269
+ plt.rcParams['ytick.labelsize'] = 10
270
+ plt.rcParams['xtick.direction'] = 'in' # 'in', 'out', 'inout'
271
+ plt.rcParams['ytick.direction'] = 'in'
272
+
273
+ # Legend settings
274
+ plt.rcParams['legend.fontsize'] = 12
275
+ plt.rcParams['legend.frameon'] = True
276
+ plt.rcParams['legend.framealpha'] = 0.8
277
+
278
+ # Grid settings
279
+ plt.rcParams['grid.alpha'] = 0.3
280
+ plt.rcParams['grid.linestyle'] = '--'
281
+ ```
282
+
283
+ ## GridSpec for Complex Layouts
284
+
285
+ ```python
286
+ from matplotlib.gridspec import GridSpec
287
+
288
+ fig = plt.figure(figsize=(12, 8))
289
+ gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)
290
+
291
+ # Span multiple cells
292
+ ax1 = fig.add_subplot(gs[0, :]) # Top row, all columns
293
+ ax2 = fig.add_subplot(gs[1:, 0]) # Bottom two rows, first column
294
+ ax3 = fig.add_subplot(gs[1, 1:]) # Middle row, last two columns
295
+ ax4 = fig.add_subplot(gs[2, 1]) # Bottom row, middle column
296
+ ax5 = fig.add_subplot(gs[2, 2]) # Bottom row, right column
297
+ ```
298
+
299
+ ## 3D Plotting
300
+
301
+ ```python
302
+ from mpl_toolkits.mplot3d import Axes3D
303
+
304
+ fig = plt.figure()
305
+ ax = fig.add_subplot(111, projection='3d')
306
+
307
+ # Plot types
308
+ ax.plot(x, y, z) # 3D line
309
+ ax.scatter(x, y, z) # 3D scatter
310
+ ax.plot_surface(X, Y, Z) # 3D surface
311
+ ax.plot_wireframe(X, Y, Z) # 3D wireframe
312
+ ax.contour(X, Y, Z) # 3D contour
313
+ ax.bar3d(x, y, z, dx, dy, dz) # 3D bar
314
+
315
+ # Customization
316
+ ax.set_xlabel('X')
317
+ ax.set_ylabel('Y')
318
+ ax.set_zlabel('Z')
319
+ ax.view_init(elev=30, azim=45) # Set viewing angle
320
+ ```
321
+
322
+ ## Animation
323
+
324
+ ```python
325
+ from matplotlib.animation import FuncAnimation
326
+
327
+ fig, ax = plt.subplots()
328
+ line, = ax.plot([], [])
329
+
330
+ def init():
331
+ ax.set_xlim(0, 2*np.pi)
332
+ ax.set_ylim(-1, 1)
333
+ return line,
334
+
335
+ def update(frame):
336
+ x = np.linspace(0, 2*np.pi, 100)
337
+ y = np.sin(x + frame/10)
338
+ line.set_data(x, y)
339
+ return line,
340
+
341
+ anim = FuncAnimation(fig, update, init_func=init,
342
+ frames=100, interval=50, blit=True)
343
+
344
+ # Save animation
345
+ anim.save('animation.gif', writer='pillow', fps=20)
346
+ anim.save('animation.mp4', writer='ffmpeg', fps=20)
347
+ ```
348
+
349
+ ## Image Operations
350
+
351
+ ```python
352
+ # Read and display image
353
+ img = plt.imread('image.png')
354
+ ax.imshow(img)
355
+
356
+ # Display matrix as image
357
+ im = ax.imshow(matrix, cmap='viridis', aspect='auto',
358
+ interpolation='nearest', origin='lower')
359
+
360
+ # Colorbar
361
+ cbar = plt.colorbar(im, ax=ax)
362
+ cbar.set_label('Values')
363
+
364
+ # Image extent (set coordinates)
365
+ ax.imshow(img, extent=[x_min, x_max, y_min, y_max])
366
+ ```
367
+
368
+ ## Event Handling
369
+
370
+ ```python
371
+ # Mouse click event
372
+ def on_click(event):
373
+ if event.inaxes:
374
+ print(f'Clicked at x={event.xdata:.2f}, y={event.ydata:.2f}')
375
+
376
+ fig.canvas.mpl_connect('button_press_event', on_click)
377
+
378
+ # Key press event
379
+ def on_key(event):
380
+ print(f'Key pressed: {event.key}')
381
+
382
+ fig.canvas.mpl_connect('key_press_event', on_key)
383
+ ```
384
+
385
+ ## Useful Utilities
386
+
387
+ ```python
388
+ # Get current axis limits
389
+ xlims = ax.get_xlim()
390
+ ylims = ax.get_ylim()
391
+
392
+ # Set equal aspect ratio
393
+ ax.set_aspect('equal', adjustable='box')
394
+
395
+ # Share axes between subplots
396
+ fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
397
+
398
+ # Twin axes (two y-axes)
399
+ ax2 = ax1.twinx()
400
+
401
+ # Remove tick labels
402
+ ax.set_xticklabels([])
403
+ ax.set_yticklabels([])
404
+
405
+ # Scientific notation
406
+ ax.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
407
+
408
+ # Date formatting
409
+ import matplotlib.dates as mdates
410
+ ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
411
+ ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
412
+ ```
.scider/skills/matplotlib/references/common_issues.md ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Matplotlib Common Issues and Solutions
2
+
3
+ Troubleshooting guide for frequently encountered matplotlib problems.
4
+
5
+ ## Display and Backend Issues
6
+
7
+ ### Issue: Plots Not Showing
8
+
9
+ **Problem:** `plt.show()` doesn't display anything
10
+
11
+ **Solutions:**
12
+ ```python
13
+ # 1. Check if backend is properly set (for interactive use)
14
+ import matplotlib
15
+ print(matplotlib.get_backend())
16
+
17
+ # 2. Try different backends
18
+ matplotlib.use('TkAgg') # or 'Qt5Agg', 'MacOSX'
19
+ import matplotlib.pyplot as plt
20
+
21
+ # 3. In Jupyter notebooks, use magic command
22
+ # Static images (keep the comment on its own line; the magic parses trailing text as arguments)
+ %matplotlib inline
23
+ # or
24
+ # Interactive plots (keep the comment on its own line; the magic parses trailing text as arguments)
+ %matplotlib widget
25
+
26
+ # 4. Ensure plt.show() is called
27
+ plt.plot([1, 2, 3])
28
+ plt.show()
29
+ ```
30
+
31
+ ### Issue: "RuntimeError: main thread is not in main loop"
32
+
33
+ **Problem:** Interactive mode issues with threading
34
+
35
+ **Solution:**
36
+ ```python
37
+ # Switch to non-interactive backend
38
+ import matplotlib
39
+ matplotlib.use('Agg')
40
+ import matplotlib.pyplot as plt
41
+
42
+ # Or turn off interactive mode
43
+ plt.ioff()
44
+ ```
45
+
46
+ ### Issue: Figures Not Updating Interactively
47
+
48
+ **Problem:** Changes not reflected in interactive windows
49
+
50
+ **Solution:**
51
+ ```python
52
+ # Enable interactive mode
53
+ plt.ion()
54
+
55
+ # Draw after each change
56
+ plt.plot(x, y)
57
+ plt.draw()
58
+ plt.pause(0.001) # Brief pause to update display
59
+ ```
60
+
61
+ ## Layout and Spacing Issues
62
+
63
+ ### Issue: Overlapping Labels and Titles
64
+
65
+ **Problem:** Labels, titles, or tick labels overlap or get cut off
66
+
67
+ **Solutions:**
68
+ ```python
69
+ # Solution 1: Constrained layout (RECOMMENDED)
70
+ fig, ax = plt.subplots(constrained_layout=True)
71
+
72
+ # Solution 2: Tight layout
73
+ fig, ax = plt.subplots()
74
+ plt.tight_layout()
75
+
76
+ # Solution 3: Adjust margins manually
77
+ plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.15)
78
+
79
+ # Solution 4: Save with bbox_inches='tight'
80
+ plt.savefig('figure.png', bbox_inches='tight')
81
+
82
+ # Solution 5: Rotate long tick labels
83
+ ax.set_xticklabels(labels, rotation=45, ha='right')
84
+ ```
85
+
86
+ ### Issue: Colorbar Affects Subplot Size
87
+
88
+ **Problem:** Adding colorbar shrinks the plot
89
+
90
+ **Solution:**
91
+ ```python
92
+ # Solution 1: Use constrained layout
93
+ fig, ax = plt.subplots(constrained_layout=True)
94
+ im = ax.imshow(data)
95
+ plt.colorbar(im, ax=ax)
96
+
97
+ # Solution 2: Manually specify colorbar dimensions
98
+ from mpl_toolkits.axes_grid1 import make_axes_locatable
99
+ divider = make_axes_locatable(ax)
100
+ cax = divider.append_axes("right", size="5%", pad=0.05)
101
+ plt.colorbar(im, cax=cax)
102
+
103
+ # Solution 3: For multiple subplots, share colorbar
104
+ fig, axes = plt.subplots(1, 3, figsize=(15, 4))
105
+ for ax in axes:
106
+ im = ax.imshow(data)
107
+ fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.95)
108
+ ```
109
+
110
+ ### Issue: Subplots Too Close Together
111
+
112
+ **Problem:** Multiple subplots overlapping
113
+
114
+ **Solution:**
115
+ ```python
116
+ # Solution 1: Use constrained_layout
117
+ fig, axes = plt.subplots(2, 2, constrained_layout=True)
118
+
119
+ # Solution 2: Adjust spacing with subplots_adjust
120
+ fig, axes = plt.subplots(2, 2)
121
+ plt.subplots_adjust(hspace=0.4, wspace=0.4)
122
+
123
+ # Solution 3: Specify spacing in tight_layout
124
+ plt.tight_layout(h_pad=2.0, w_pad=2.0)
125
+ ```
126
+
127
+ ## Memory and Performance Issues
128
+
129
+ ### Issue: Memory Leak with Multiple Figures
130
+
131
+ **Problem:** Memory usage grows when creating many figures
132
+
133
+ **Solution:**
134
+ ```python
135
+ # Close figures explicitly
136
+ fig, ax = plt.subplots()
137
+ ax.plot(x, y)
138
+ plt.savefig('plot.png')
139
+ plt.close(fig) # or plt.close('all')
140
+
141
+ # Clear current figure without closing
142
+ plt.clf()
143
+
144
+ # Clear current axes
145
+ plt.cla()
146
+ ```
147
+
148
+ ### Issue: Large File Sizes
149
+
150
+ **Problem:** Saved figures are too large
151
+
152
+ **Solutions:**
153
+ ```python
154
+ # Solution 1: Reduce DPI
155
+ plt.savefig('figure.png', dpi=150) # Instead of 300
156
+
157
+ # Solution 2: Use rasterization for complex plots
158
+ ax.plot(x, y, rasterized=True)
159
+
160
+ # Solution 3: Use vector format for simple plots
161
+ plt.savefig('figure.pdf') # or .svg
162
+
163
+ # Solution 4: Compress PNG
164
+ plt.savefig('figure.png', dpi=300, pil_kwargs={'optimize': True})
165
+ ```
166
+
167
+ ### Issue: Slow Plotting with Large Datasets
168
+
169
+ **Problem:** Plotting takes too long with many points
170
+
171
+ **Solutions:**
172
+ ```python
173
+ # Solution 1: Downsample data
174
+ from scipy.signal import decimate
175
+ y_downsampled = decimate(y, 10) # Anti-alias filter, then keep every 10th sample (plot against x[::10])
176
+
177
+ # Solution 2: Use rasterization
178
+ ax.plot(x, y, rasterized=True)
179
+
180
+ # Solution 3: Rasterize lines that have already been plotted
181
+ ax.plot(x, y)
182
+ for line in ax.get_lines():
183
+     line.set_rasterized(True)
184
+
185
+ # Solution 4: For scatter plots, consider hexbin or 2d histogram
186
+ ax.hexbin(x, y, gridsize=50, cmap='viridis')
187
+ ```
188
+
189
+ ## Font and Text Issues
190
+
191
+ ### Issue: Font Warnings
192
+
193
+ **Problem:** "findfont: Font family [...] not found"
194
+
195
+ **Solutions:**
196
+ ```python
197
+ # Solution 1: Use available fonts
198
+ from matplotlib.font_manager import findfont, FontProperties
199
+ print(findfont(FontProperties(family='sans-serif')))
200
+
201
+ # Solution 2: Rebuild font cache
202
+ import matplotlib.font_manager
203
+ matplotlib.font_manager._load_fontmanager(try_read_cache=False) # font_manager._rebuild() was removed in newer Matplotlib releases
204
+
205
+ # Solution 3: Suppress warnings
206
+ import warnings
207
+ warnings.filterwarnings("ignore", category=UserWarning)
208
+
209
+ # Solution 4: Specify fallback fonts
210
+ plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans', 'sans-serif']
211
+ ```
212
+
213
+ ### Issue: LaTeX Rendering Errors
214
+
215
+ **Problem:** Math text not rendering correctly
216
+
217
+ **Solutions:**
218
+ ```python
219
+ # Solution 1: Use raw strings with r prefix
220
+ ax.set_xlabel(r'$\alpha$') # Not '\alpha'
221
+
222
+ # Solution 2: Escape backslashes in regular strings
223
+ ax.set_xlabel('$\\alpha$')
224
+
225
+ # Solution 3: Disable LaTeX if not installed
226
+ plt.rcParams['text.usetex'] = False
227
+
228
+ # Solution 4: Use mathtext instead of full LaTeX
229
+ # Mathtext is always available, no LaTeX installation needed
230
+ ax.text(x, y, r'$\int_0^\infty e^{-x} dx$')
231
+ ```
232
+
233
+ ### Issue: Text Cut Off or Outside Figure
234
+
235
+ **Problem:** Labels or annotations appear outside figure bounds
236
+
237
+ **Solutions:**
238
+ ```python
239
+ # Solution 1: Use bbox_inches='tight'
240
+ plt.savefig('figure.png', bbox_inches='tight')
241
+
242
+ # Solution 2: Adjust figure bounds
243
+ plt.subplots_adjust(left=0.15, right=0.85, top=0.85, bottom=0.15)
244
+
245
+ # Solution 3: Clip text to axes
246
+ ax.text(x, y, 'text', clip_on=True)
247
+
248
+ # Solution 4: Use constrained_layout
249
+ fig, ax = plt.subplots(constrained_layout=True)
250
+ ```
251
+
252
+ ## Color and Colormap Issues
253
+
254
+ ### Issue: Colorbar Not Matching Plot
255
+
256
+ **Problem:** Colorbar shows different range than data
257
+
258
+ **Solution:**
259
+ ```python
260
+ # Explicitly set vmin and vmax
261
+ im = ax.imshow(data, vmin=0, vmax=1, cmap='viridis')
262
+ plt.colorbar(im, ax=ax)
263
+
264
+ # Or use the same norm for multiple plots
265
+ import matplotlib.colors as mcolors
266
+ norm = mcolors.Normalize(vmin=data.min(), vmax=data.max())
267
+ im1 = ax1.imshow(data1, norm=norm, cmap='viridis')
268
+ im2 = ax2.imshow(data2, norm=norm, cmap='viridis')
269
+ ```
270
+
271
+ ### Issue: Colors Look Wrong
272
+
273
+ **Problem:** Unexpected colors in plots
274
+
275
+ **Solutions:**
276
+ ```python
277
+ # Solution 1: Check color specification format
278
+ ax.plot(x, y, color='blue') # Correct
279
+ ax.plot(x, y, color=(0, 0, 1)) # Correct RGB
280
+ ax.plot(x, y, color='#0000FF') # Correct hex
281
+
282
+ # Solution 2: Verify colormap exists
283
+ print(plt.colormaps()) # List available colormaps
284
+
285
+ # Solution 3: For scatter plots, ensure c shape matches
286
+ ax.scatter(x, y, c=colors) # colors should have same length as x, y
287
+
288
+ # Solution 4: Check if alpha is set correctly
289
+ ax.plot(x, y, alpha=1.0) # 0=transparent, 1=opaque
290
+ ```
291
+
292
+ ### Issue: Reversed Colormap
293
+
294
+ **Problem:** Colormap direction is backwards
295
+
296
+ **Solution:**
297
+ ```python
298
+ # Add _r suffix to reverse any colormap
299
+ ax.imshow(data, cmap='viridis_r')
300
+ ```
301
+
302
+ ## Axis and Scale Issues
303
+
304
+ ### Issue: Axis Limits Not Working
305
+
306
+ **Problem:** `set_xlim` or `set_ylim` not taking effect
307
+
308
+ **Solutions:**
309
+ ```python
310
+ # Solution 1: Set after plotting
311
+ ax.plot(x, y)
312
+ ax.set_xlim(0, 10)
313
+ ax.set_ylim(-1, 1)
314
+
315
+ # Solution 2: Disable autoscaling
316
+ ax.autoscale(False)
317
+ ax.set_xlim(0, 10)
318
+
319
+ # Solution 3: Use axis method
320
+ ax.axis([xmin, xmax, ymin, ymax])
321
+ ```
322
+
323
+ ### Issue: Log Scale with Zero or Negative Values
324
+
325
+ **Problem:** Errors, warnings, or missing points when using log scale with data ≤ 0
326
+
327
+ **Solutions:**
328
+ ```python
329
+ # Solution 1: Filter out non-positive values
330
+ mask = (data > 0)
331
+ ax.plot(x[mask], data[mask])
332
+ ax.set_yscale('log')
333
+
334
+ # Solution 2: Use symlog for data with positive and negative values
335
+ ax.set_yscale('symlog')
336
+
337
+ # Solution 3: Add small offset
338
+ ax.plot(x, data + 1e-10)
339
+ ax.set_yscale('log')
340
+ ```
341
+
342
+ ### Issue: Dates Not Displaying Correctly
343
+
344
+ **Problem:** Date axis shows numbers instead of dates
345
+
346
+ **Solution:**
347
+ ```python
348
+ import matplotlib.dates as mdates
349
+ import pandas as pd
350
+
351
+ # Convert to datetime if needed
352
+ dates = pd.to_datetime(date_strings)
353
+
354
+ ax.plot(dates, values)
355
+
356
+ # Format date axis
357
+ ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
358
+ ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
359
+ plt.xticks(rotation=45)
360
+ ```
361
+
362
+ ## Legend Issues
363
+
364
+ ### Issue: Legend Covers Data
365
+
366
+ **Problem:** Legend obscures important parts of plot
367
+
368
+ **Solutions:**
369
+ ```python
370
+ # Solution 1: Use 'best' location
371
+ ax.legend(loc='best')
372
+
373
+ # Solution 2: Place outside plot area
374
+ ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
375
+
376
+ # Solution 3: Make legend semi-transparent
377
+ ax.legend(framealpha=0.7)
378
+
379
+ # Solution 4: Put legend below plot
380
+ ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)
381
+ ```
382
+
383
+ ### Issue: Too Many Items in Legend
384
+
385
+ **Problem:** Legend is cluttered with many entries
386
+
387
+ **Solutions:**
388
+ ```python
389
+ # Solution 1: Only label selected items
390
+ for i, (x, y) in enumerate(data):
391
+     label = f'Data {i}' if i % 5 == 0 else None
392
+     ax.plot(x, y, label=label)
393
+
394
+ # Solution 2: Use multiple columns
395
+ ax.legend(ncol=3)
396
+
397
+ # Solution 3: Create custom legend with fewer entries
398
+ from matplotlib.lines import Line2D
399
+ custom_lines = [Line2D([0], [0], color='r'),
400
+ Line2D([0], [0], color='b')]
401
+ ax.legend(custom_lines, ['Category A', 'Category B'])
402
+
403
+ # Solution 4: Use separate legend figure
404
+ fig_leg = plt.figure(figsize=(3, 2))
405
+ ax_leg = fig_leg.add_subplot(111)
406
+ ax_leg.legend(*ax.get_legend_handles_labels(), loc='center')
407
+ ax_leg.axis('off')
408
+ ```
409
+
410
+ ## 3D Plot Issues
411
+
412
+ ### Issue: 3D Plots Look Flat
413
+
414
+ **Problem:** Difficult to perceive depth in 3D plots
415
+
416
+ **Solutions:**
417
+ ```python
418
+ # Solution 1: Adjust viewing angle
419
+ ax.view_init(elev=30, azim=45)
420
+
421
+ # Solution 2: Add gridlines
422
+ ax.grid(True)
423
+
424
+ # Solution 3: Use color for depth
425
+ scatter = ax.scatter(x, y, z, c=z, cmap='viridis')
426
+
427
+ # Solution 4: Rotate interactively (if using interactive backend)
428
+ # User can click and drag to rotate
429
+ ```
430
+
431
+ ### Issue: 3D Axis Labels Cut Off
432
+
433
+ **Problem:** 3D axis labels appear outside figure
434
+
435
+ **Solution:**
436
+ ```python
437
+ from mpl_toolkits.mplot3d import Axes3D
438
+
439
+ fig = plt.figure(figsize=(10, 8))
440
+ ax = fig.add_subplot(111, projection='3d')
441
+ ax.plot_surface(X, Y, Z)
442
+
443
+ # Add padding
444
+ fig.tight_layout(pad=3.0)
445
+
446
+ # Or save with tight bounding box
447
+ plt.savefig('3d_plot.png', bbox_inches='tight', pad_inches=0.5)
448
+ ```
449
+
450
+ ## Image and Colorbar Issues
451
+
452
+ ### Issue: Images Appear Flipped
453
+
454
+ **Problem:** Image orientation is wrong
455
+
456
+ **Solution:**
457
+ ```python
458
+ # Set origin parameter
459
+ ax.imshow(img, origin='lower') # or 'upper' (default)
460
+
461
+ # Or flip array
462
+ ax.imshow(np.flipud(img))
463
+ ```
464
+
465
+ ### Issue: Images Look Pixelated
466
+
467
+ **Problem:** Image appears blocky when zoomed
468
+
469
+ **Solutions:**
470
+ ```python
471
+ # Solution 1: Use interpolation
472
+ ax.imshow(img, interpolation='bilinear')
473
+ # Options: 'nearest', 'bilinear', 'bicubic', 'spline16', 'spline36', etc.
474
+
475
+ # Solution 2: Increase DPI when saving
476
+ plt.savefig('figure.png', dpi=300)
477
+
478
+ # Solution 3: Use vector format if appropriate
479
+ plt.savefig('figure.pdf')
480
+ ```
481
+
482
+ ## Common Errors and Fixes
483
+
484
+ ### "TypeError: 'AxesSubplot' object is not subscriptable"
485
+
486
+ **Problem:** Trying to index single axes
487
+ ```python
488
+ # Wrong
489
+ fig, ax = plt.subplots()
490
+ ax[0].plot(x, y) # Error!
491
+
492
+ # Correct
493
+ fig, ax = plt.subplots()
494
+ ax.plot(x, y)
495
+ ```
496
+
497
+ ### "ValueError: x and y must have same first dimension"
498
+
499
+ **Problem:** Data arrays have mismatched lengths
500
+ ```python
501
+ # Check shapes
502
+ print(f"x shape: {x.shape}, y shape: {y.shape}")
503
+
504
+ # Ensure they match
505
+ assert len(x) == len(y), "x and y must have same length"
506
+ ```
507
+
508
+ ### "AttributeError: 'numpy.ndarray' object has no attribute 'plot'"
509
+
510
+ **Problem:** Calling plot on array instead of axes
511
+ ```python
512
+ # Wrong
513
+ data.plot(x, y)
514
+
515
+ # Correct
516
+ ax.plot(x, y)
517
+ # or for pandas
518
+ data.plot(ax=ax)
519
+ ```
520
+
521
+ ## Best Practices to Avoid Issues
522
+
523
+ 1. **Always use the OO interface** - Avoid pyplot state machine
524
+ ```python
525
+ fig, ax = plt.subplots() # Good
526
+ ax.plot(x, y)
527
+ ```
528
+
529
+ 2. **Use constrained_layout** - Prevents overlap issues
530
+ ```python
531
+ fig, ax = plt.subplots(constrained_layout=True)
532
+ ```
533
+
534
+ 3. **Close figures explicitly** - Prevents memory leaks
535
+ ```python
536
+ plt.close(fig)
537
+ ```
538
+
539
+ 4. **Set figure size at creation** - Better than resizing later
540
+ ```python
541
+ fig, ax = plt.subplots(figsize=(10, 6))
542
+ ```
543
+
544
+ 5. **Use raw strings for math text** - Avoids escape issues
545
+ ```python
546
+ ax.set_xlabel(r'$\alpha$')
547
+ ```
548
+
549
+ 6. **Check data shapes before plotting** - Catch size mismatches early
550
+ ```python
551
+ assert len(x) == len(y)
552
+ ```
553
+
554
+ 7. **Use appropriate DPI** - 300 for print, 150 for web
555
+ ```python
556
+ plt.savefig('figure.png', dpi=300)
557
+ ```
558
+
559
+ 8. **Test with different backends** - If display issues occur
560
+ ```python
561
+ import matplotlib
562
+ matplotlib.use('TkAgg')
563
+ ```
.scider/skills/matplotlib/references/plot_types.md ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Matplotlib Plot Types Guide
2
+
3
+ Comprehensive guide to different plot types in matplotlib with examples and use cases.
4
+
5
+ ## 1. Line Plots
6
+
7
+ **Use cases:** Time series, continuous data, trends, function visualization
8
+
9
+ ### Basic Line Plot
10
+ ```python
11
+ fig, ax = plt.subplots(figsize=(10, 6))
12
+ ax.plot(x, y, linewidth=2, label='Data')
13
+ ax.set_xlabel('X axis')
14
+ ax.set_ylabel('Y axis')
15
+ ax.legend()
16
+ ```
17
+
18
+ ### Multiple Lines
19
+ ```python
20
+ ax.plot(x, y1, label='Dataset 1', linewidth=2)
21
+ ax.plot(x, y2, label='Dataset 2', linewidth=2, linestyle='--')
22
+ ax.plot(x, y3, label='Dataset 3', linewidth=2, linestyle=':')
23
+ ax.legend()
24
+ ```
25
+
26
+ ### Line with Markers
27
+ ```python
28
+ ax.plot(x, y, marker='o', markersize=8, linestyle='-',
29
+ linewidth=2, markerfacecolor='red', markeredgecolor='black')
30
+ ```
31
+
32
+ ### Step Plot
33
+ ```python
34
+ ax.step(x, y, where='mid', linewidth=2, label='Step function')
35
+ # where options: 'pre', 'post', 'mid'
36
+ ```
37
+
38
+ ### Error Bars
39
+ ```python
40
+ ax.errorbar(x, y, yerr=error, fmt='o-', linewidth=2,
41
+ capsize=5, capthick=2, label='With uncertainty')
42
+ ```
43
+
44
+ ## 2. Scatter Plots
45
+
46
+ **Use cases:** Correlations, relationships between variables, clusters, outliers
47
+
48
+ ### Basic Scatter
49
+ ```python
50
+ ax.scatter(x, y, s=50, alpha=0.6)
51
+ ```
52
+
53
+ ### Sized and Colored Scatter
54
+ ```python
55
+ scatter = ax.scatter(x, y, s=sizes*100, c=colors,
56
+ cmap='viridis', alpha=0.6, edgecolors='black')
57
+ plt.colorbar(scatter, ax=ax, label='Color variable')
58
+ ```
59
+
60
+ ### Categorical Scatter
61
+ ```python
62
+ for category in categories:
63
+     mask = data['category'] == category
64
+     ax.scatter(data[mask]['x'], data[mask]['y'],
65
+                label=category, s=50, alpha=0.7)
66
+ ax.legend()
67
+ ```
68
+
69
+ ## 3. Bar Charts
70
+
71
+ **Use cases:** Categorical comparisons, discrete data, counts
72
+
73
+ ### Vertical Bar Chart
74
+ ```python
75
+ ax.bar(categories, values, color='steelblue',
76
+ edgecolor='black', linewidth=1.5)
77
+ ax.set_ylabel('Values')
78
+ ```
79
+
80
+ ### Horizontal Bar Chart
81
+ ```python
82
+ ax.barh(categories, values, color='coral',
83
+ edgecolor='black', linewidth=1.5)
84
+ ax.set_xlabel('Values')
85
+ ```
86
+
87
+ ### Grouped Bar Chart
88
+ ```python
89
+ x = np.arange(len(categories))
90
+ width = 0.35
91
+
92
+ ax.bar(x - width/2, values1, width, label='Group 1')
93
+ ax.bar(x + width/2, values2, width, label='Group 2')
94
+ ax.set_xticks(x)
95
+ ax.set_xticklabels(categories)
96
+ ax.legend()
97
+ ```
98
+
99
+ ### Stacked Bar Chart
100
+ ```python
101
+ ax.bar(categories, values1, label='Part 1')
102
+ ax.bar(categories, values2, bottom=values1, label='Part 2')
103
+ ax.bar(categories, values3, bottom=np.add(values1, values2), label='Part 3') # element-wise sum; `+` would concatenate plain lists
104
+ ax.legend()
105
+ ```
106
+
107
+ ### Bar Chart with Error Bars
108
+ ```python
109
+ ax.bar(categories, values, yerr=errors, capsize=5,
110
+ color='steelblue', edgecolor='black')
111
+ ```
112
+
113
+ ### Bar Chart with Patterns
114
+ ```python
115
+ bars1 = ax.bar(x - width/2, values1, width, label='Group 1',
116
+ color='white', edgecolor='black', hatch='//')
117
+ bars2 = ax.bar(x + width/2, values2, width, label='Group 2',
118
+ color='white', edgecolor='black', hatch='\\\\')
119
+ ```
120
+
121
+ ## 4. Histograms
122
+
123
+ **Use cases:** Distributions, frequency analysis
124
+
125
+ ### Basic Histogram
126
+ ```python
127
+ ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
128
+ ax.set_xlabel('Value')
129
+ ax.set_ylabel('Frequency')
130
+ ```
131
+
132
+ ### Multiple Overlapping Histograms
133
+ ```python
134
+ ax.hist(data1, bins=30, alpha=0.5, label='Dataset 1')
135
+ ax.hist(data2, bins=30, alpha=0.5, label='Dataset 2')
136
+ ax.legend()
137
+ ```
138
+
139
+ ### Normalized Histogram (Density)
140
+ ```python
141
+ ax.hist(data, bins=30, density=True, alpha=0.7,
142
+ edgecolor='black', label='Empirical')
143
+
144
+ # Overlay theoretical distribution
145
+ from scipy.stats import norm
146
+ x = np.linspace(data.min(), data.max(), 100)
147
+ ax.plot(x, norm.pdf(x, data.mean(), data.std()),
148
+ 'r-', linewidth=2, label='Normal fit')
149
+ ax.legend()
150
+ ```
151
+
152
+ ### 2D Histogram (Hexbin)
153
+ ```python
154
+ hexbin = ax.hexbin(x, y, gridsize=30, cmap='Blues')
155
+ plt.colorbar(hexbin, ax=ax, label='Counts')
156
+ ```
157
+
158
+ ### 2D Histogram (hist2d)
159
+ ```python
160
+ h = ax.hist2d(x, y, bins=30, cmap='Blues')
161
+ plt.colorbar(h[3], ax=ax, label='Counts')
162
+ ```
163
+
164
+ ## 5. Box and Violin Plots
165
+
166
+ **Use cases:** Statistical distributions, outlier detection, comparing distributions
167
+
168
+ ### Box Plot
169
+ ```python
170
+ ax.boxplot([data1, data2, data3],
171
+ labels=['Group A', 'Group B', 'Group C'],
172
+ showmeans=True, meanline=True)
173
+ ax.set_ylabel('Values')
174
+ ```
175
+
176
+ ### Horizontal Box Plot
177
+ ```python
178
+ ax.boxplot([data1, data2, data3], vert=False,
179
+ labels=['Group A', 'Group B', 'Group C'])
180
+ ax.set_xlabel('Values')
181
+ ```
182
+
183
+ ### Violin Plot
184
+ ```python
185
+ parts = ax.violinplot([data1, data2, data3],
186
+ positions=[1, 2, 3],
187
+ showmeans=True, showmedians=True)
188
+ ax.set_xticks([1, 2, 3])
189
+ ax.set_xticklabels(['Group A', 'Group B', 'Group C'])
190
+ ```
191
+
192
+ ## 6. Heatmaps
193
+
194
+ **Use cases:** Matrix data, correlations, intensity maps
195
+
196
+ ### Basic Heatmap
197
+ ```python
198
+ im = ax.imshow(matrix, cmap='coolwarm', aspect='auto')
199
+ plt.colorbar(im, ax=ax, label='Values')
200
+ ax.set_xlabel('X')
201
+ ax.set_ylabel('Y')
202
+ ```
203
+
204
+ ### Heatmap with Annotations
205
+ ```python
206
+ im = ax.imshow(matrix, cmap='coolwarm')
207
+ plt.colorbar(im, ax=ax)
208
+
209
+ # Add text annotations
210
+ for i in range(matrix.shape[0]):
211
+     for j in range(matrix.shape[1]):
212
+         text = ax.text(j, i, f'{matrix[i, j]:.2f}',
213
+                        ha='center', va='center', color='black')
214
+ ```
215
+
216
+ ### Correlation Matrix
217
+ ```python
218
+ corr = data.corr()
219
+ im = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
220
+ plt.colorbar(im, ax=ax, label='Correlation')
221
+
222
+ # Set tick labels
223
+ ax.set_xticks(range(len(corr)))
224
+ ax.set_yticks(range(len(corr)))
225
+ ax.set_xticklabels(corr.columns, rotation=45, ha='right')
226
+ ax.set_yticklabels(corr.columns)
227
+ ```
228
+
229
+ ## 7. Contour Plots
230
+
231
+ **Use cases:** 3D data on 2D plane, topography, function visualization
232
+
233
+ ### Contour Lines
234
+ ```python
235
+ contour = ax.contour(X, Y, Z, levels=10, cmap='viridis')
236
+ ax.clabel(contour, inline=True, fontsize=8)
237
+ plt.colorbar(contour, ax=ax)
238
+ ```
239
+
240
+ ### Filled Contours
241
+ ```python
242
+ contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
243
+ plt.colorbar(contourf, ax=ax)
244
+ ```
245
+
246
+ ### Combined Contours
247
+ ```python
248
+ contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis', alpha=0.8)
249
+ contour = ax.contour(X, Y, Z, levels=10, colors='black',
250
+ linewidths=0.5, alpha=0.4)
251
+ ax.clabel(contour, inline=True, fontsize=8)
252
+ plt.colorbar(contourf, ax=ax)
253
+ ```
254
+
255
+ ## 8. Pie Charts
256
+
257
+ **Use cases:** Proportions, percentages (use sparingly)
258
+
259
+ ### Basic Pie Chart
260
+ ```python
261
+ ax.pie(sizes, labels=labels, autopct='%1.1f%%',
262
+ startangle=90, colors=colors)
263
+ ax.axis('equal') # Equal aspect ratio ensures circular pie
264
+ ```
265
+
266
+ ### Exploded Pie Chart
267
+ ```python
268
+ explode = (0.1, 0, 0, 0) # Explode first slice
269
+ ax.pie(sizes, explode=explode, labels=labels,
270
+ autopct='%1.1f%%', shadow=True, startangle=90)
271
+ ax.axis('equal')
272
+ ```
273
+
274
+ ### Donut Chart
275
+ ```python
276
+ ax.pie(sizes, labels=labels, autopct='%1.1f%%',
277
+ wedgeprops=dict(width=0.5), startangle=90)
278
+ ax.axis('equal')
279
+ ```
280
+
281
+ ## 9. Polar Plots
282
+
283
+ **Use cases:** Cyclic data, directional data, radar charts
284
+
285
+ ### Basic Polar Plot
286
+ ```python
287
+ theta = np.linspace(0, 2*np.pi, 100)
288
+ r = np.abs(np.sin(2*theta))
289
+
290
+ ax = plt.subplot(111, projection='polar')
291
+ ax.plot(theta, r, linewidth=2)
292
+ ```
293
+
294
+ ### Radar Chart
295
+ ```python
296
+ categories = ['A', 'B', 'C', 'D', 'E']
297
+ values = [4, 3, 5, 2, 4]
298
+
299
+ # Add first value to the end to close the polygon
300
+ angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False)
301
+ values_closed = np.concatenate((values, [values[0]]))
302
+ angles_closed = np.concatenate((angles, [angles[0]]))
303
+
304
+ ax = plt.subplot(111, projection='polar')
305
+ ax.plot(angles_closed, values_closed, 'o-', linewidth=2)
306
+ ax.fill(angles_closed, values_closed, alpha=0.25)
307
+ ax.set_xticks(angles)
308
+ ax.set_xticklabels(categories)
309
+ ```
310
+
311
+ ## 10. Stream and Quiver Plots
312
+
313
+ **Use cases:** Vector fields, flow visualization
314
+
315
+ ### Quiver Plot (Vector Field)
316
+ ```python
317
+ ax.quiver(X, Y, U, V, alpha=0.8)
318
+ ax.set_xlabel('X')
319
+ ax.set_ylabel('Y')
320
+ ax.set_aspect('equal')
321
+ ```
322
+
323
+ ### Stream Plot
324
+ ```python
325
+ ax.streamplot(X, Y, U, V, density=1.5, color='k', linewidth=1)
326
+ ax.set_xlabel('X')
327
+ ax.set_ylabel('Y')
328
+ ax.set_aspect('equal')
329
+ ```
330
+
331
+ ## 11. Fill Between
332
+
333
+ **Use cases:** Uncertainty bounds, confidence intervals, areas under curves
334
+
335
+ ### Fill Between Two Curves
336
+ ```python
337
+ ax.plot(x, y, 'k-', linewidth=2, label='Mean')
338
+ ax.fill_between(x, y - std, y + std, alpha=0.3,
339
+ label='±1 std dev')
340
+ ax.legend()
341
+ ```
342
+
343
+ ### Fill Between with Condition
344
+ ```python
345
+ ax.plot(x, y1, label='Line 1')
346
+ ax.plot(x, y2, label='Line 2')
347
+ ax.fill_between(x, y1, y2, where=(y2 >= y1),
348
+ alpha=0.3, label='y2 > y1', interpolate=True)
349
+ ax.legend()
350
+ ```
351
+
352
+ ## 12. 3D Plots
353
+
354
+ **Use cases:** Three-dimensional data visualization
355
+
356
+ ### 3D Scatter
357
+ ```python
358
+ from mpl_toolkits.mplot3d import Axes3D
359
+
360
+ fig = plt.figure(figsize=(10, 8))
361
+ ax = fig.add_subplot(111, projection='3d')
362
+ scatter = ax.scatter(x, y, z, c=colors, cmap='viridis',
363
+ marker='o', s=50)
364
+ plt.colorbar(scatter, ax=ax)
365
+ ax.set_xlabel('X')
366
+ ax.set_ylabel('Y')
367
+ ax.set_zlabel('Z')
368
+ ```
369
+
370
+ ### 3D Surface Plot
371
+ ```python
372
+ fig = plt.figure(figsize=(10, 8))
373
+ ax = fig.add_subplot(111, projection='3d')
374
+ surf = ax.plot_surface(X, Y, Z, cmap='viridis',
375
+ edgecolor='none', alpha=0.9)
376
+ plt.colorbar(surf, ax=ax)
377
+ ax.set_xlabel('X')
378
+ ax.set_ylabel('Y')
379
+ ax.set_zlabel('Z')
380
+ ```
381
+
382
+ ### 3D Wireframe
383
+ ```python
384
+ fig = plt.figure(figsize=(10, 8))
385
+ ax = fig.add_subplot(111, projection='3d')
386
+ ax.plot_wireframe(X, Y, Z, color='black', linewidth=0.5)
387
+ ax.set_xlabel('X')
388
+ ax.set_ylabel('Y')
389
+ ax.set_zlabel('Z')
390
+ ```
391
+
392
+ ### 3D Contour
393
+ ```python
394
+ fig = plt.figure(figsize=(10, 8))
395
+ ax = fig.add_subplot(111, projection='3d')
396
+ ax.contour(X, Y, Z, levels=15, cmap='viridis')
397
+ ax.set_xlabel('X')
398
+ ax.set_ylabel('Y')
399
+ ax.set_zlabel('Z')
400
+ ```
401
+
402
+ ## 13. Specialized Plots
403
+
404
+ ### Stem Plot
405
+ ```python
406
+ ax.stem(x, y, linefmt='C0-', markerfmt='C0o', basefmt='k-')
407
+ ax.set_xlabel('X')
408
+ ax.set_ylabel('Y')
409
+ ```
410
+
411
+ ### Filled Polygon
412
+ ```python
413
+ vertices = [(0, 0), (1, 0), (1, 1), (0, 1)]
414
+ from matplotlib.patches import Polygon
415
+ polygon = Polygon(vertices, closed=True, edgecolor='black',
416
+ facecolor='lightblue', alpha=0.5)
417
+ ax.add_patch(polygon)
418
+ ax.set_xlim(-0.5, 1.5)
419
+ ax.set_ylim(-0.5, 1.5)
420
+ ```
421
+
422
+ ### Staircase Plot
423
+ ```python
424
+ ax.stairs(values, edges, fill=True, alpha=0.5)
425
+ ```
426
+
427
+ ### Broken Barh (Gantt-style)
428
+ ```python
429
+ ax.broken_barh([(10, 50), (100, 20), (130, 10)], (10, 9),
430
+ facecolors='tab:blue')
431
+ ax.broken_barh([(10, 20), (50, 50), (120, 30)], (20, 9),
432
+ facecolors='tab:orange')
433
+ ax.set_ylim(5, 35)
434
+ ax.set_xlim(0, 200)
435
+ ax.set_xlabel('Time')
436
+ ax.set_yticks([15, 25])
437
+ ax.set_yticklabels(['Task 1', 'Task 2'])
438
+ ```
439
+
440
+ ## 14. Time Series Plots
441
+
442
+ ### Basic Time Series
443
+ ```python
444
+ import pandas as pd
445
+ import matplotlib.dates as mdates
446
+
447
+ ax.plot(dates, values, linewidth=2)
448
+ ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
449
+ ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
450
+ plt.xticks(rotation=45)
451
+ ax.set_xlabel('Date')
452
+ ax.set_ylabel('Value')
453
+ ```
454
+
455
+ ### Time Series with Shaded Regions
456
+ ```python
457
+ ax.plot(dates, values, linewidth=2)
458
+ # Shade weekends or specific periods
459
+ ax.axvspan(start_date, end_date, alpha=0.2, color='gray')
460
+ ```
461
+
462
+ ## Plot Selection Guide
463
+
464
+ | Data Type | Recommended Plot | Alternative Options |
465
+ |-----------|-----------------|---------------------|
466
+ | Single continuous variable | Histogram, KDE | Box plot, Violin plot |
467
+ | Two continuous variables | Scatter plot | Hexbin, 2D histogram |
468
+ | Time series | Line plot | Area plot, Step plot |
469
+ | Categorical vs continuous | Bar chart, Box plot | Violin plot, Strip plot |
470
+ | Two categorical variables | Heatmap | Grouped bar chart |
471
+ | Three continuous variables | 3D scatter, Contour | Color-coded scatter |
472
+ | Proportions | Bar chart | Pie chart (use sparingly) |
473
+ | Distributions comparison | Box plot, Violin plot | Overlaid histograms |
474
+ | Correlation matrix | Heatmap | Clustered heatmap |
475
+ | Vector field | Quiver plot, Stream plot | - |
476
+ | Function visualization | Line plot, Contour | 3D surface |
.scider/skills/matplotlib/references/styling_guide.md ADDED
@@ -0,0 +1,589 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Matplotlib Styling Guide
2
+
3
+ Comprehensive guide for styling and customizing matplotlib visualizations.
4
+
5
+ ## Colormaps
6
+
7
+ ### Colormap Categories
8
+
9
+ **1. Perceptually Uniform Sequential**
10
+ Best for ordered data that progresses from low to high values.
11
+ - `viridis` (default, colorblind-friendly)
12
+ - `plasma`
13
+ - `inferno`
14
+ - `magma`
15
+ - `cividis` (optimized for colorblind viewers)
16
+
17
+ **Usage:**
18
+ ```python
19
+ im = ax.imshow(data, cmap='viridis')
20
+ scatter = ax.scatter(x, y, c=values, cmap='plasma')
21
+ ```
22
+
23
+ **2. Sequential**
24
+ Traditional colormaps for ordered data.
25
+ - `Blues`, `Greens`, `Reds`, `Oranges`, `Purples`
26
+ - `YlOrBr`, `YlOrRd`, `OrRd`, `PuRd`
27
+ - `BuPu`, `GnBu`, `PuBu`, `YlGnBu`
28
+
29
+ **3. Diverging**
30
+ Best for data with a meaningful center point (e.g., zero, mean).
31
+ - `coolwarm` (blue to red)
32
+ - `RdBu` (red-blue)
33
+ - `RdYlBu` (red-yellow-blue)
34
+ - `RdYlGn` (red-yellow-green)
35
+ - `PiYG`, `PRGn`, `BrBG`, `PuOr`, `RdGy`
36
+
37
+ **Usage:**
38
+ ```python
39
+ # Center colormap at zero
40
+ im = ax.imshow(data, cmap='coolwarm', vmin=-1, vmax=1)
41
+ ```
42
+
43
+ **4. Qualitative**
44
+ Best for categorical/nominal data without inherent ordering.
45
+ - `tab10` (10 distinct colors)
46
+ - `tab20` (20 distinct colors)
47
+ - `Set1`, `Set2`, `Set3`
48
+ - `Pastel1`, `Pastel2`
49
+ - `Dark2`, `Accent`, `Paired`
50
+
51
+ **Usage:**
52
+ ```python
53
+ colors = plt.cm.tab10(np.linspace(0, 1, n_categories))
54
+ for i, category in enumerate(categories):
55
+     ax.plot(x, y[i], color=colors[i], label=category)
56
+ ```
57
+
58
+ **5. Cyclic**
59
+ Best for cyclic data (e.g., phase, angle).
60
+ - `twilight`
61
+ - `twilight_shifted`
62
+ - `hsv`
63
+
64
+ ### Colormap Best Practices
65
+
66
+ 1. **Avoid `jet` colormap** - Not perceptually uniform, misleading
67
+ 2. **Use perceptually uniform colormaps** - `viridis`, `plasma`, `cividis`
68
+ 3. **Consider colorblind users** - Use `viridis`, `cividis`, or test with colorblind simulators
69
+ 4. **Match colormap to data type**:
70
+ - Sequential: increasing/decreasing data
71
+ - Diverging: data with meaningful center
72
+ - Qualitative: categories
73
+ 5. **Reverse colormaps** - Add `_r` suffix: `viridis_r`, `coolwarm_r`
74
+
75
+ ### Creating Custom Colormaps
76
+
77
+ ```python
78
+ from matplotlib.colors import LinearSegmentedColormap
79
+
80
+ # From color list
81
+ colors = ['blue', 'white', 'red']
82
+ n_bins = 100
83
+ cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_bins)
84
+
85
+ # From RGB values
86
+ colors = [(0, 0, 1), (1, 1, 1), (1, 0, 0)] # RGB tuples
87
+ cmap = LinearSegmentedColormap.from_list('custom', colors)
88
+
89
+ # Use the custom colormap
90
+ ax.imshow(data, cmap=cmap)
91
+ ```
92
+
93
+ ### Discrete Colormaps
94
+
95
+ ```python
96
+ import matplotlib.colors as mcolors
97
+
98
+ # Create discrete colormap from continuous
99
+ cmap = plt.cm.viridis
100
+ bounds = np.linspace(0, 10, 11)
101
+ norm = mcolors.BoundaryNorm(bounds, cmap.N)
102
+ im = ax.imshow(data, cmap=cmap, norm=norm)
103
+ ```
104
+
105
+ ## Style Sheets
106
+
107
+ ### Using Built-in Styles
108
+
109
+ ```python
110
+ # List available styles
111
+ print(plt.style.available)
112
+
113
+ # Apply a style
114
+ plt.style.use('seaborn-v0_8-darkgrid')
115
+
116
+ # Apply multiple styles (later styles override earlier ones)
117
+ plt.style.use(['seaborn-v0_8-whitegrid', 'seaborn-v0_8-poster'])
118
+
119
+ # Temporarily use a style
120
+ with plt.style.context('ggplot'):
121
+ fig, ax = plt.subplots()
122
+ ax.plot(x, y)
123
+ ```
124
+
125
+ ### Popular Built-in Styles
126
+
127
+ - `default` - Matplotlib's default style
128
+ - `classic` - Classic matplotlib look (pre-2.0)
129
+ - `seaborn-v0_8-*` - Seaborn-inspired styles
130
+ - `seaborn-v0_8-darkgrid`, `seaborn-v0_8-whitegrid`
131
+ - `seaborn-v0_8-dark`, `seaborn-v0_8-white`
132
+ - `seaborn-v0_8-ticks`, `seaborn-v0_8-poster`, `seaborn-v0_8-talk`
133
+ - `ggplot` - ggplot2-inspired style
134
+ - `bmh` - Bayesian Methods for Hackers style
135
+ - `fivethirtyeight` - FiveThirtyEight style
136
+ - `grayscale` - Grayscale style
137
+
138
+ ### Creating Custom Style Sheets
139
+
140
+ Create a file named `custom_style.mplstyle`:
141
+
142
+ ```
143
+ # custom_style.mplstyle
144
+
145
+ # Figure
146
+ figure.figsize: 10, 6
147
+ figure.dpi: 100
148
+ figure.facecolor: white
149
+
150
+ # Font
151
+ font.family: sans-serif
152
+ font.sans-serif: Arial, Helvetica
153
+ font.size: 12
154
+
155
+ # Axes
156
+ axes.labelsize: 14
157
+ axes.titlesize: 16
158
+ axes.facecolor: white
159
+ axes.edgecolor: black
160
+ axes.linewidth: 1.5
161
+ axes.grid: True
162
+ axes.axisbelow: True
163
+
164
+ # Grid
165
+ grid.color: gray
166
+ grid.linestyle: --
167
+ grid.linewidth: 0.5
168
+ grid.alpha: 0.3
169
+
170
+ # Lines
171
+ lines.linewidth: 2
172
+ lines.markersize: 8
173
+
174
+ # Ticks
175
+ xtick.labelsize: 10
176
+ ytick.labelsize: 10
177
+ xtick.direction: in
178
+ ytick.direction: in
179
+ xtick.major.size: 6
180
+ ytick.major.size: 6
181
+ xtick.minor.size: 3
182
+ ytick.minor.size: 3
183
+
184
+ # Legend
185
+ legend.fontsize: 12
186
+ legend.frameon: True
187
+ legend.framealpha: 0.8
188
+ legend.fancybox: True
189
+
190
+ # Savefig
191
+ savefig.dpi: 300
192
+ savefig.bbox: tight
193
+ savefig.facecolor: white
194
+ ```
195
+
196
+ Load and use:
197
+ ```python
198
+ plt.style.use('path/to/custom_style.mplstyle')
199
+ ```
200
+
201
+ ## rcParams Configuration
202
+
203
+ ### Global Configuration
204
+
205
+ ```python
206
+ import matplotlib.pyplot as plt
207
+
208
+ # Configure globally
209
+ plt.rcParams['figure.figsize'] = (10, 6)
210
+ plt.rcParams['font.size'] = 12
211
+ plt.rcParams['axes.labelsize'] = 14
212
+
213
+ # Or update multiple at once
214
+ plt.rcParams.update({
215
+ 'figure.figsize': (10, 6),
216
+ 'font.size': 12,
217
+ 'axes.labelsize': 14,
218
+ 'axes.titlesize': 16,
219
+ 'lines.linewidth': 2
220
+ })
221
+ ```
222
+
223
+ ### Temporary Configuration
224
+
225
+ ```python
226
+ # Context manager for temporary changes
227
+ with plt.rc_context({'font.size': 14, 'lines.linewidth': 2.5}):
228
+ fig, ax = plt.subplots()
229
+ ax.plot(x, y)
230
+ ```
231
+
232
+ ### Common rcParams
233
+
234
+ **Figure settings:**
235
+ ```python
236
+ plt.rcParams['figure.figsize'] = (10, 6)
237
+ plt.rcParams['figure.dpi'] = 100
238
+ plt.rcParams['figure.facecolor'] = 'white'
239
+ plt.rcParams['figure.edgecolor'] = 'white'
240
+ plt.rcParams['figure.autolayout'] = False
241
+ plt.rcParams['figure.constrained_layout.use'] = True
242
+ ```
243
+
244
+ **Font settings:**
245
+ ```python
246
+ plt.rcParams['font.family'] = 'sans-serif'
247
+ plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica', 'DejaVu Sans']
248
+ plt.rcParams['font.size'] = 12
249
+ plt.rcParams['font.weight'] = 'normal'
250
+ ```
251
+
252
+ **Axes settings:**
253
+ ```python
254
+ plt.rcParams['axes.facecolor'] = 'white'
255
+ plt.rcParams['axes.edgecolor'] = 'black'
256
+ plt.rcParams['axes.linewidth'] = 1.5
257
+ plt.rcParams['axes.grid'] = True
258
+ plt.rcParams['axes.labelsize'] = 14
259
+ plt.rcParams['axes.titlesize'] = 16
260
+ plt.rcParams['axes.labelweight'] = 'normal'
261
+ plt.rcParams['axes.spines.top'] = True
262
+ plt.rcParams['axes.spines.right'] = True
263
+ ```
264
+
265
+ **Line settings:**
266
+ ```python
267
+ plt.rcParams['lines.linewidth'] = 2
268
+ plt.rcParams['lines.linestyle'] = '-'
269
+ plt.rcParams['lines.marker'] = 'None'
270
+ plt.rcParams['lines.markersize'] = 6
271
+ ```
272
+
273
+ **Save settings:**
274
+ ```python
275
+ plt.rcParams['savefig.dpi'] = 300
276
+ plt.rcParams['savefig.format'] = 'png'
277
+ plt.rcParams['savefig.bbox'] = 'tight'
278
+ plt.rcParams['savefig.pad_inches'] = 0.1
279
+ plt.rcParams['savefig.transparent'] = False
280
+ ```
281
+
282
+ ## Color Palettes
283
+
284
+ ### Named Color Sets
285
+
286
+ ```python
287
+ # Tableau colors
288
+ tableau_colors = plt.cm.tab10.colors
289
+
290
+ # CSS4 colors (subset)
291
+ css_colors = ['steelblue', 'coral', 'teal', 'goldenrod', 'crimson']
292
+
293
+ # Manual definition
294
+ custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
295
+ ```
296
+
297
+ ### Color Cycles
298
+
299
+ ```python
300
+ # Set default color cycle
301
+ from cycler import cycler
302
+ colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
303
+ plt.rcParams['axes.prop_cycle'] = cycler(color=colors)
304
+
305
+ # Or combine color and line style
306
+ plt.rcParams['axes.prop_cycle'] = cycler(color=colors) + cycler(linestyle=['-', '--', ':', '-.'])
307
+ ```
308
+
309
+ ### Palette Generation
310
+
311
+ ```python
312
+ # Evenly spaced colors from colormap
313
+ n_colors = 5
314
+ colors = plt.cm.viridis(np.linspace(0, 1, n_colors))
315
+
316
+ # Use in plot
317
+ for i, (x, y) in enumerate(data):
318
+ ax.plot(x, y, color=colors[i])
319
+ ```
320
+
321
+ ## Typography
322
+
323
+ ### Font Configuration
324
+
325
+ ```python
326
+ # Set font family
327
+ plt.rcParams['font.family'] = 'serif'
328
+ plt.rcParams['font.serif'] = ['Times New Roman', 'DejaVu Serif']
329
+
330
+ # Or sans-serif
331
+ plt.rcParams['font.family'] = 'sans-serif'
332
+ plt.rcParams['font.sans-serif'] = ['Arial', 'Helvetica']
333
+
334
+ # Or monospace
335
+ plt.rcParams['font.family'] = 'monospace'
336
+ plt.rcParams['font.monospace'] = ['Courier New', 'DejaVu Sans Mono']
337
+ ```
338
+
339
+ ### Font Properties in Text
340
+
341
+ ```python
342
+ from matplotlib import font_manager
343
+
344
+ # Specify font properties
345
+ ax.text(x, y, 'Text',
346
+ fontsize=14,
347
+ fontweight='bold', # 'normal', 'bold', 'heavy', 'light'
348
+ fontstyle='italic', # 'normal', 'italic', 'oblique'
349
+ fontfamily='serif')
350
+
351
+ # Use specific font file
352
+ prop = font_manager.FontProperties(fname='path/to/font.ttf')
353
+ ax.text(x, y, 'Text', fontproperties=prop)
354
+ ```
355
+
356
+ ### Mathematical Text
357
+
358
+ ```python
359
+ # LaTeX-style math
360
+ ax.set_title(r'$\alpha > \beta$')
361
+ ax.set_xlabel(r'$\mu \pm \sigma$')
362
+ ax.text(x, y, r'$\int_0^\infty e^{-x} dx = 1$')
363
+
364
+ # Subscripts and superscripts
365
+ ax.set_ylabel(r'$y = x^2 + 2x + 1$')
366
+ ax.text(x, y, r'$x_1, x_2, \ldots, x_n$')
367
+
368
+ # Greek letters
369
+ ax.text(x, y, r'$\alpha, \beta, \gamma, \delta, \epsilon$')
370
+ ```
371
+
372
+ ### Using Full LaTeX
373
+
374
+ ```python
375
+ # Enable full LaTeX rendering (requires LaTeX installation)
376
+ plt.rcParams['text.usetex'] = True
377
+ plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}'
378
+
379
+ ax.set_title(r'\textbf{Bold Title}')
380
+ ax.set_xlabel(r'Time $t$ (s)')
381
+ ```
382
+
383
+ ## Spines and Grids
384
+
385
+ ### Spine Customization
386
+
387
+ ```python
388
+ # Hide specific spines
389
+ ax.spines['top'].set_visible(False)
390
+ ax.spines['right'].set_visible(False)
391
+
392
+ # Move spine position
393
+ ax.spines['left'].set_position(('outward', 10))
394
+ ax.spines['bottom'].set_position(('data', 0))
395
+
396
+ # Change spine color and width
397
+ ax.spines['left'].set_color('red')
398
+ ax.spines['bottom'].set_linewidth(2)
399
+ ```
400
+
401
+ ### Grid Customization
402
+
403
+ ```python
404
+ # Basic grid
405
+ ax.grid(True)
406
+
407
+ # Customized grid
408
+ ax.grid(True, which='major', linestyle='--', linewidth=0.8, alpha=0.3)
409
+ ax.grid(True, which='minor', linestyle=':', linewidth=0.5, alpha=0.2)
410
+
411
+ # Grid for specific axis
412
+ ax.grid(True, axis='x') # Only vertical lines
413
+ ax.grid(True, axis='y') # Only horizontal lines
414
+
415
+ # Grid behind or in front of data
416
+ ax.set_axisbelow(True) # Grid behind data
417
+ ```
418
+
419
+ ## Legend Customization
420
+
421
+ ### Legend Positioning
422
+
423
+ ```python
424
+ # Location strings
425
+ ax.legend(loc='best') # Automatic best position
426
+ ax.legend(loc='upper right')
427
+ ax.legend(loc='upper left')
428
+ ax.legend(loc='lower right')
429
+ ax.legend(loc='lower left')
430
+ ax.legend(loc='center')
431
+ ax.legend(loc='upper center')
432
+ ax.legend(loc='lower center')
433
+ ax.legend(loc='center left')
434
+ ax.legend(loc='center right')
435
+
436
+ # Precise positioning (bbox_to_anchor)
437
+ ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') # Outside plot area
438
+ ax.legend(bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3) # Below plot
439
+ ```
440
+
441
+ ### Legend Styling
442
+
443
+ ```python
444
+ ax.legend(
445
+ fontsize=12,
446
+ frameon=True, # Show frame
447
+ framealpha=0.9, # Frame transparency
448
+ fancybox=True, # Rounded corners
449
+ shadow=True, # Shadow effect
450
+ ncol=2, # Number of columns
451
+ title='Legend Title', # Legend title
452
+ title_fontsize=14, # Title font size
453
+ edgecolor='black', # Frame edge color
454
+ facecolor='white' # Frame background color
455
+ )
456
+ ```
457
+
458
+ ### Custom Legend Entries
459
+
460
+ ```python
461
+ from matplotlib.lines import Line2D
462
+
463
+ # Create custom legend handles
464
+ custom_lines = [Line2D([0], [0], color='red', lw=2),
465
+ Line2D([0], [0], color='blue', lw=2, linestyle='--'),
466
+ Line2D([0], [0], marker='o', color='w', markerfacecolor='green', markersize=10)]
467
+
468
+ ax.legend(custom_lines, ['Label 1', 'Label 2', 'Label 3'])
469
+ ```
470
+
471
+ ## Layout and Spacing
472
+
473
+ ### Constrained Layout
474
+
475
+ ```python
476
+ # Preferred method (automatic adjustment)
477
+ fig, axes = plt.subplots(2, 2, constrained_layout=True)
478
+ ```
479
+
480
+ ### Tight Layout
481
+
482
+ ```python
483
+ # Alternative method
484
+ fig, axes = plt.subplots(2, 2)
485
+ plt.tight_layout(pad=1.5, h_pad=2.0, w_pad=2.0)
486
+ ```
487
+
488
+ ### Manual Adjustment
489
+
490
+ ```python
491
+ # Fine-grained control
492
+ plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1,
493
+ hspace=0.3, wspace=0.4)
494
+ ```
495
+
496
+ ## Professional Publication Style
497
+
498
+ Example configuration for publication-quality figures:
499
+
500
+ ```python
501
+ # Publication style configuration
502
+ plt.rcParams.update({
503
+ # Figure
504
+ 'figure.figsize': (8, 6),
505
+ 'figure.dpi': 100,
506
+ 'savefig.dpi': 300,
507
+ 'savefig.bbox': 'tight',
508
+ 'savefig.pad_inches': 0.1,
509
+
510
+ # Font
511
+ 'font.family': 'sans-serif',
512
+ 'font.sans-serif': ['Arial', 'Helvetica'],
513
+ 'font.size': 11,
514
+
515
+ # Axes
516
+ 'axes.labelsize': 12,
517
+ 'axes.titlesize': 14,
518
+ 'axes.linewidth': 1.5,
519
+ 'axes.grid': False,
520
+ 'axes.spines.top': False,
521
+ 'axes.spines.right': False,
522
+
523
+ # Lines
524
+ 'lines.linewidth': 2,
525
+ 'lines.markersize': 8,
526
+
527
+ # Ticks
528
+ 'xtick.labelsize': 10,
529
+ 'ytick.labelsize': 10,
530
+ 'xtick.major.size': 6,
531
+ 'ytick.major.size': 6,
532
+ 'xtick.major.width': 1.5,
533
+ 'ytick.major.width': 1.5,
534
+ 'xtick.direction': 'in',
535
+ 'ytick.direction': 'in',
536
+
537
+ # Legend
538
+ 'legend.fontsize': 10,
539
+ 'legend.frameon': True,
540
+ 'legend.framealpha': 1.0,
541
+ 'legend.edgecolor': 'black'
542
+ })
543
+ ```
544
+
545
+ ## Dark Theme
546
+
547
+ ```python
548
+ # Dark background style
549
+ plt.style.use('dark_background')
550
+
551
+ # Or manual configuration
552
+ plt.rcParams.update({
553
+ 'figure.facecolor': '#1e1e1e',
554
+ 'axes.facecolor': '#1e1e1e',
555
+ 'axes.edgecolor': 'white',
556
+ 'axes.labelcolor': 'white',
557
+ 'text.color': 'white',
558
+ 'xtick.color': 'white',
559
+ 'ytick.color': 'white',
560
+ 'grid.color': 'gray',
561
+ 'legend.facecolor': '#1e1e1e',
562
+ 'legend.edgecolor': 'white'
563
+ })
564
+ ```
565
+
566
+ ## Color Accessibility
567
+
568
+ ### Colorblind-Friendly Palettes
569
+
570
+ ```python
571
+ # Use colorblind-friendly colormaps
572
+ colorblind_friendly = ['viridis', 'plasma', 'cividis']
573
+
574
+ # Colorblind-friendly discrete colors
575
+ cb_colors = ['#0173B2', '#DE8F05', '#029E73', '#CC78BC',
576
+ '#CA9161', '#949494', '#ECE133', '#56B4E9']
577
+
578
+ # Test with simulation tools or use these validated palettes
579
+ ```
580
+
581
+ ### High Contrast
582
+
583
+ ```python
584
+ # Ensure sufficient contrast
585
+ plt.rcParams['axes.edgecolor'] = 'black'
586
+ plt.rcParams['axes.linewidth'] = 2
587
+ plt.rcParams['xtick.major.width'] = 2
588
+ plt.rcParams['ytick.major.width'] = 2
589
+ ```
.scider/skills/matplotlib/scripts/plot_template.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Matplotlib Plot Template
4
+
5
+ Comprehensive template demonstrating various plot types and best practices.
6
+ Use this as a starting point for creating publication-quality visualizations.
7
+
8
+ Usage:
9
+ python plot_template.py [--plot-type TYPE] [--style STYLE] [--output FILE]
10
+
11
+ Plot types:
12
+ line, scatter, bar, histogram, heatmap, contour, box, violin, 3d, all
13
+ """
14
+
15
+ import argparse
16
+
17
+ import matplotlib.pyplot as plt
18
+ import numpy as np
19
+ from matplotlib.gridspec import GridSpec
20
+
21
+
22
def set_publication_style():
    """Apply the template's publication rcParams to matplotlib's global settings."""
    # Canvas size and export quality.
    figure_rc = {
        "figure.figsize": (10, 6),
        "figure.dpi": 100,
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
    }
    # Text sizes for body text, axis labels, titles, ticks and legends.
    text_rc = {
        "font.size": 11,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
    }
    # Stroke widths for plotted lines and the axes frame.
    stroke_rc = {
        "lines.linewidth": 2,
        "axes.linewidth": 1.5,
    }
    plt.rcParams.update({**figure_rc, **text_rc, **stroke_rc})
40
+
41
+
42
def generate_sample_data(seed=42):
    """Generate the synthetic datasets used by the example plots.

    Args:
        seed: Value passed to ``np.random.seed`` before any random draw.
            The default of 42 reproduces the template's original data.

    Returns:
        dict with the keys ``x``, ``y1``, ``y2`` (line data), ``scatter_x``,
        ``scatter_y`` (200 normal draws each), ``categories`` and
        ``bar_values`` (five labelled integers in [10, 100)), ``hist_data``
        (1000 standard-normal draws), ``matrix`` (10x10 uniform values) and
        ``X``, ``Y``, ``Z`` (a 100x100 grid over [-3, 3] with
        ``Z = sin(sqrt(X**2 + Y**2))``).
    """
    # This seeds NumPy's *global* generator, so later np.random calls made
    # elsewhere in the process are affected as well.
    np.random.seed(seed)

    x = np.linspace(0, 10, 100)
    y1 = np.sin(x)
    y2 = np.cos(x)

    # The order of the random draws below determines the generated values;
    # keep it stable so the default seed keeps producing the same data.
    scatter_x = np.random.randn(200)
    scatter_y = np.random.randn(200)
    categories = ["A", "B", "C", "D", "E"]
    bar_values = np.random.randint(10, 100, len(categories))
    hist_data = np.random.normal(0, 1, 1000)
    matrix = np.random.rand(10, 10)

    X, Y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
    Z = np.sin(np.sqrt(X**2 + Y**2))

    return {
        "x": x,
        "y1": y1,
        "y2": y2,
        "scatter_x": scatter_x,
        "scatter_y": scatter_y,
        "categories": categories,
        "bar_values": bar_values,
        "hist_data": hist_data,
        "matrix": matrix,
        "X": X,
        "Y": Y,
        "Z": Z,
    }
72
+
73
+
74
def create_line_plot(data, ax=None):
    """Draw the sin/cos line-plot example.

    Args:
        data: Mapping providing the ``x``, ``y1`` and ``y2`` sequences.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    ax.plot(
        data["x"], data["y1"], label="sin(x)", linewidth=2, marker="o", markevery=10, markersize=6
    )
    ax.plot(data["x"], data["y2"], label="cos(x)", linewidth=2, linestyle="--")

    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("Line Plot Example")
    ax.legend(loc="best", framealpha=0.9)
    ax.grid(True, alpha=0.3, linestyle="--")

    # Remove top and right spines for a cleaner look.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    return ax if fig is None else fig
97
+
98
+
99
def create_scatter_plot(data, ax=None):
    """Draw the scatter-plot example with per-point color and size.

    Args:
        data: Mapping providing the ``scatter_x`` and ``scatter_y`` arrays.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # Color encodes distance from the origin; marker area grows with |x|.
    colors = np.sqrt(data["scatter_x"] ** 2 + data["scatter_y"] ** 2)
    sizes = 50 * (1 + np.abs(data["scatter_x"]))

    scatter = ax.scatter(
        data["scatter_x"],
        data["scatter_y"],
        c=colors,
        s=sizes,
        alpha=0.6,
        cmap="viridis",
        edgecolors="black",
        linewidth=0.5,
    )

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_title("Scatter Plot Example")
    ax.grid(True, alpha=0.3, linestyle="--")

    # Attach the colorbar to the figure that owns ``ax`` instead of relying
    # on pyplot's notion of the current figure.
    cbar = ax.figure.colorbar(scatter, ax=ax)
    cbar.set_label("Distance from origin")

    return ax if fig is None else fig
131
+
132
+
133
def create_bar_chart(data, ax=None):
    """Draw the bar-chart example with error bars and value-based colors.

    Args:
        data: Mapping providing ``categories`` (tick labels) and
            ``bar_values`` (one numeric height per category).
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    x_pos = np.arange(len(data["categories"]))
    # Demo error bars: random integers drawn from NumPy's global generator.
    errors = np.random.randint(5, 15, len(data["categories"]))

    # Accept plain lists as well as arrays, and avoid dividing by zero when
    # every bar has height 0.
    values = np.asarray(data["bar_values"], dtype=float)
    peak = values.max() if values.size else 0.0
    scaled = values / peak if peak != 0 else np.zeros_like(values)
    colors = plt.cm.viridis(scaled)

    ax.bar(
        x_pos,
        values,
        yerr=errors,
        color=colors,  # one color per bar, scaled by its value
        edgecolor="black",
        linewidth=1.5,
        capsize=5,
        alpha=0.8,
    )

    ax.set_xlabel("Category")
    ax.set_ylabel("Values")
    ax.set_title("Bar Chart Example")
    ax.set_xticks(x_pos)
    ax.set_xticklabels(data["categories"])
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    # Remove top and right spines.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    return ax if fig is None else fig
171
+
172
+
173
def create_histogram(data, ax=None):
    """Draw a density histogram with a fitted normal curve on top.

    Args:
        data: Mapping providing the ``hist_data`` array of samples.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # SciPy is only needed for this plot type, so it is imported locally.
    from scipy.stats import norm

    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    samples = data["hist_data"]
    # density=True puts the bars on the same scale as the PDF drawn below.
    ax.hist(samples, bins=30, density=True, alpha=0.7, edgecolor="black", color="steelblue")

    # Overlay the normal distribution fitted to the samples.
    mu, std = norm.fit(samples)
    x_theory = np.linspace(samples.min(), samples.max(), 100)
    ax.plot(
        x_theory,
        norm.pdf(x_theory, mu, std),
        "r-",
        linewidth=2,
        label=f"Normal fit (μ={mu:.2f}, σ={std:.2f})",
    )

    ax.set_xlabel("Value")
    ax.set_ylabel("Density")
    ax.set_title("Histogram with Normal Fit")
    ax.legend()
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    return ax if fig is None else fig
204
+
205
+
206
def create_heatmap(data, ax=None, annotate=False):
    """Draw the heatmap example with a colorbar.

    Args:
        data: Mapping providing ``matrix``, a 2-D array of values. The color
            scale is fixed to the range [0, 1].
        ax: Axes to draw on. When omitted, a new 10x8 figure is created.
        annotate: When True, write each cell's value (two decimals) at the
            center of the cell. Defaults to False, which matches the
            template's previous output.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)

    im = ax.imshow(data["matrix"], cmap="coolwarm", aspect="auto", vmin=0, vmax=1)

    # Attach the colorbar to the figure that owns ``ax`` instead of relying
    # on pyplot's notion of the current figure.
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.set_label("Value")

    if annotate:
        # imshow puts column indices on x and row indices on y.
        for (row, col), value in np.ndenumerate(np.asarray(data["matrix"])):
            ax.text(
                col, row, f"{value:.2f}", ha="center", va="center", color="black", fontsize=8
            )

    ax.set_xlabel("X Index")
    ax.set_ylabel("Y Index")
    ax.set_title("Heatmap Example")

    return ax if fig is None else fig
230
+
231
+
232
def create_contour_plot(data, ax=None):
    """Draw filled contours with labelled contour lines on top.

    Args:
        data: Mapping providing the ``X``, ``Y`` grids and the ``Z`` values.
        ax: Axes to draw on. When omitted, a new 10x8 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 8), constrained_layout=True)

    # Filled contours carry the color information.
    contourf = ax.contourf(data["X"], data["Y"], data["Z"], levels=20, cmap="viridis", alpha=0.8)

    # Thin black contour lines, drawn with fewer levels so the labels stay readable.
    contour = ax.contour(
        data["X"], data["Y"], data["Z"], levels=10, colors="black", linewidths=0.5, alpha=0.4
    )
    ax.clabel(contour, inline=True, fontsize=8)

    # Attach the colorbar to the figure that owns ``ax`` instead of relying
    # on pyplot's notion of the current figure.
    cbar = ax.figure.colorbar(contourf, ax=ax)
    cbar.set_label("Z value")

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_title("Contour Plot Example")
    ax.set_aspect("equal")

    return ax if fig is None else fig
260
+
261
+
262
def create_box_plot(data, ax=None):
    """Draw a box plot comparing four normal distributions.

    Args:
        data: Accepted for signature parity with the other plot functions;
            this function generates its own samples and does not read it.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    group_names = ["Group 1", "Group 2", "Group 3", "Group 4"]

    # One sample of 100 draws per group, with standard deviation 1..4,
    # taken from NumPy's global generator.
    box_data = [np.random.normal(0, std, 100) for std in range(1, 5)]

    ax.boxplot(
        box_data,
        patch_artist=True,
        showmeans=True,
        boxprops=dict(facecolor="lightblue", edgecolor="black"),
        medianprops=dict(color="red", linewidth=2),
        meanprops=dict(marker="D", markerfacecolor="green", markersize=8),
    )

    # Label the ticks explicitly (as the violin plot does) rather than via
    # boxplot's ``labels=`` keyword, which newer Matplotlib deprecates.
    # Boxes are placed at positions 1..N by default.
    ax.set_xticks(range(1, len(group_names) + 1))
    ax.set_xticklabels(group_names)

    ax.set_xlabel("Groups")
    ax.set_ylabel("Values")
    ax.set_title("Box Plot Example")
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    return ax if fig is None else fig
288
+
289
+
290
def create_violin_plot(data, ax=None):
    """Draw a violin plot showing the shape of four normal distributions.

    Args:
        data: Accepted for signature parity with the other plot functions;
            this function generates its own samples and does not read it.
        ax: Axes to draw on. When omitted, a new 10x6 figure is created.

    Returns:
        The newly created Figure when ``ax`` was omitted, otherwise the Axes
        that was passed in.
    """
    # Track figure ownership separately: ``ax`` is rebound below, so testing
    # ``ax is None`` again at return time could never succeed.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 6), constrained_layout=True)

    # One sample of 100 draws per group, with standard deviation 1..4,
    # taken from NumPy's global generator.
    violin_data = [np.random.normal(0, std, 100) for std in range(1, 5)]

    parts = ax.violinplot(violin_data, positions=range(1, 5), showmeans=True, showmedians=True)

    # Restyle the violin bodies.
    for pc in parts["bodies"]:
        pc.set_facecolor("lightblue")
        pc.set_alpha(0.7)
        pc.set_edgecolor("black")

    ax.set_xlabel("Groups")
    ax.set_ylabel("Values")
    ax.set_title("Violin Plot Example")
    ax.set_xticks(range(1, 5))
    ax.set_xticklabels(["Group 1", "Group 2", "Group 3", "Group 4"])
    ax.grid(True, axis="y", alpha=0.3, linestyle="--")

    return ax if fig is None else fig
316
+
317
+
318
def create_3d_plot():
    """Build a standalone figure showing a 3D surface of sin(sqrt(x^2 + y^2)).

    Returns:
        The Figure holding the single 3D axes and its colorbar.
    """
    # NOTE(review): Axes3D is not referenced below; presumably it is imported
    # so the "3d" projection is registered on older Matplotlib releases.
    # Confirm before removing.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    # Sample the surface on a 50x50 grid over [-5, 5] x [-5, 5].
    grid_x, grid_y = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
    height = np.sin(np.sqrt(grid_x**2 + grid_y**2))

    fig = plt.figure(figsize=(12, 9))
    ax = fig.add_subplot(111, projection="3d")

    surface = ax.plot_surface(grid_x, grid_y, height, cmap="viridis", edgecolor="none", alpha=0.9)
    fig.colorbar(surface, ax=ax, shrink=0.5)

    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_zlabel("Z")
    ax.set_title("3D Surface Plot Example")

    # Camera position: 30 degrees elevation, 45 degrees azimuth.
    ax.view_init(elev=30, azim=45)

    plt.tight_layout()
    return fig
347
+
348
+
349
def create_comprehensive_figure():
    """Assemble a 3x3 grid figure containing seven of the example plots.

    Returns:
        The Figure holding all panels and the overall title.
    """
    data = generate_sample_data()

    fig = plt.figure(figsize=(16, 12), constrained_layout=True)
    grid = GridSpec(3, 3, figure=fig)

    # (grid cell, drawing function), in drawing order. The order is kept
    # fixed because the bar and box plots draw from NumPy's global generator.
    panels = (
        (grid[0, :2], create_line_plot),  # top row, spans two columns
        (grid[0, 2], create_bar_chart),  # top right
        (grid[1, 0], create_scatter_plot),  # middle left
        (grid[1, 1], create_histogram),  # middle center
        (grid[1, 2], create_box_plot),  # middle right
        (grid[2, :2], create_contour_plot),  # bottom row, spans two columns
        (grid[2, 2], create_heatmap),  # bottom right
    )
    for cell, draw_panel in panels:
        draw_panel(data, fig.add_subplot(cell))

    fig.suptitle("Comprehensive Matplotlib Template", fontsize=18, fontweight="bold")

    return fig
381
+
382
+
383
def main():
    """Parse the command line, draw the requested plot, save it and show it."""
    plot_type_choices = [
        "line",
        "scatter",
        "bar",
        "histogram",
        "heatmap",
        "contour",
        "box",
        "violin",
        "3d",
        "all",
    ]

    parser = argparse.ArgumentParser(description="Matplotlib plot template")
    parser.add_argument(
        "--plot-type",
        type=str,
        default="all",
        choices=plot_type_choices,
        help="Type of plot to create",
    )
    parser.add_argument("--style", type=str, default="default", help="Matplotlib style to use")
    parser.add_argument("--output", type=str, default="plot.png", help="Output filename")
    args = parser.parse_args()

    # "default" selects the template's own publication settings; any other
    # value is handed to matplotlib as a style name.
    if args.style == "default":
        set_publication_style()
    else:
        plt.style.use(args.style)

    data = generate_sample_data()

    # Plots that take the shared sample data.
    data_plots = {
        "line": create_line_plot,
        "scatter": create_scatter_plot,
        "bar": create_bar_chart,
        "histogram": create_histogram,
        "heatmap": create_heatmap,
        "contour": create_contour_plot,
        "box": create_box_plot,
        "violin": create_violin_plot,
    }
    # Plots that build their own figure and take no arguments.
    standalone_plots = {
        "3d": create_3d_plot,
        "all": create_comprehensive_figure,
    }

    if args.plot_type in standalone_plots:
        fig = standalone_plots[args.plot_type]()
    else:
        fig = data_plots[args.plot_type](data)

    # pyplot saves whichever figure is current, i.e. the one just drawn.
    plt.savefig(args.output, dpi=300, bbox_inches="tight")
    print(f"Plot saved to {args.output}")

    plt.show()


if __name__ == "__main__":
    main()
.scider/skills/matplotlib/scripts/style_configurator.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Matplotlib Style Configurator
4
+
5
+ Interactive utility to configure matplotlib style preferences and generate
6
+ custom style sheets. Creates a preview of the style and optionally saves
7
+ it as a .mplstyle file.
8
+
9
+ Usage:
10
+ python style_configurator.py [--preset PRESET] [--output FILE] [--preview]
11
+
12
+ Presets:
13
+ publication, presentation, web, dark, minimal
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+
19
+ import matplotlib.pyplot as plt
20
+ import numpy as np
21
+ from matplotlib.gridspec import GridSpec
22
+
23
+ # Predefined style presets
24
# Background color shared by every surface of the dark preset.
_DARK_BACKGROUND = "#1e1e1e"

# Print-oriented preset: 300 dpi export, inward ticks, no top/right spines.
_PUBLICATION_PRESET = {
    "figure.figsize": (8, 6),
    "figure.dpi": 100,
    "savefig.dpi": 300,
    "savefig.bbox": "tight",
    "font.family": "sans-serif",
    "font.sans-serif": ["Arial", "Helvetica"],
    "font.size": 11,
    "axes.labelsize": 12,
    "axes.titlesize": 14,
    "axes.linewidth": 1.5,
    "axes.grid": False,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "lines.linewidth": 2,
    "lines.markersize": 8,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "xtick.direction": "in",
    "ytick.direction": "in",
    "xtick.major.size": 6,
    "ytick.major.size": 6,
    "xtick.major.width": 1.5,
    "ytick.major.width": 1.5,
    "legend.fontsize": 10,
    "legend.frameon": True,
    "legend.framealpha": 1.0,
    "legend.edgecolor": "black",
}

# Slide-oriented preset: larger canvas, fonts, lines and markers.
_PRESENTATION_PRESET = {
    "figure.figsize": (12, 8),
    "figure.dpi": 100,
    "savefig.dpi": 150,
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 24,
    "axes.linewidth": 2,
    "lines.linewidth": 3,
    "lines.markersize": 12,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "legend.fontsize": 16,
    "axes.grid": True,
    "grid.alpha": 0.3,
}

# Screen-oriented preset: 96 dpi canvas with a faint dashed grid.
_WEB_PRESET = {
    "figure.figsize": (10, 6),
    "figure.dpi": 96,
    "savefig.dpi": 150,
    "font.size": 11,
    "axes.labelsize": 12,
    "axes.titlesize": 14,
    "lines.linewidth": 2,
    "axes.grid": True,
    "grid.alpha": 0.2,
    "grid.linestyle": "--",
}

# Dark preset: dark surfaces with white text, ticks and edges.
_DARK_PRESET = {
    "figure.facecolor": _DARK_BACKGROUND,
    "figure.edgecolor": _DARK_BACKGROUND,
    "axes.facecolor": _DARK_BACKGROUND,
    "axes.edgecolor": "white",
    "axes.labelcolor": "white",
    "text.color": "white",
    "xtick.color": "white",
    "ytick.color": "white",
    "grid.color": "gray",
    "grid.alpha": 0.3,
    "axes.grid": True,
    "legend.facecolor": _DARK_BACKGROUND,
    "legend.edgecolor": "white",
    "savefig.facecolor": _DARK_BACKGROUND,
}

# Minimal preset: all four spines hidden, ticks kept, no grid.
_MINIMAL_PRESET = {
    "figure.figsize": (10, 6),
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.spines.left": False,
    "axes.spines.bottom": False,
    "axes.grid": False,
    "xtick.bottom": True,
    "ytick.left": True,
    "axes.axisbelow": True,
    "lines.linewidth": 2.5,
    "font.size": 12,
}

# Predefined style presets, keyed by the name accepted on the command line.
STYLE_PRESETS = {
    "publication": _PUBLICATION_PRESET,
    "presentation": _PRESENTATION_PRESET,
    "web": _WEB_PRESET,
    "dark": _DARK_PRESET,
    "minimal": _MINIMAL_PRESET,
}
112
+
113
+
114
def generate_preview_data(seed=42):
    """Generate the sample data drawn in the style preview.

    Args:
        seed: Value passed to ``np.random.seed`` before any random draw.
            The default of 42 reproduces the original preview data.

    Returns:
        dict with the keys ``x`` (100 points over [0, 10]), ``y1`` and ``y2``
        (sin and cos of ``x`` plus noise scaled by 0.1), ``scatter_x`` and
        ``scatter_y`` (100 points with ``scatter_y = 2 * scatter_x + noise``),
        ``categories`` (five labels) and ``bar_values`` (five fixed heights,
        as a plain list).
    """
    # This seeds NumPy's *global* generator, so later np.random calls made
    # elsewhere in the process are affected as well.
    np.random.seed(seed)

    # The order of the random draws below determines the generated values;
    # keep it stable so the default seed keeps producing the same data.
    x = np.linspace(0, 10, 100)
    y1 = np.sin(x) + 0.1 * np.random.randn(100)
    y2 = np.cos(x) + 0.1 * np.random.randn(100)
    scatter_x = np.random.randn(100)
    scatter_y = 2 * scatter_x + np.random.randn(100)
    categories = ["A", "B", "C", "D", "E"]
    bar_values = [25, 40, 30, 55, 45]

    return {
        "x": x,
        "y1": y1,
        "y2": y2,
        "scatter_x": scatter_x,
        "scatter_y": scatter_y,
        "categories": categories,
        "bar_values": bar_values,
    }
134
+
135
+
136
def create_style_preview(style_dict=None):
    """Create a 2x2 preview figure that demonstrates a style.

    Args:
        style_dict: Optional mapping of rcParams. When given (and non-empty)
            it is applied with ``plt.rcParams.update`` before drawing, which
            changes matplotlib's global settings and is not undone here.

    Returns:
        The Figure containing a line plot, a scatter plot with colorbar, a
        bar chart and a two-signal plot with shaded bands.
    """
    if style_dict:
        plt.rcParams.update(style_dict)

    data = generate_preview_data()

    fig = plt.figure(figsize=(14, 10))
    gs = GridSpec(2, 2, figure=fig, hspace=0.3, wspace=0.3)

    # Line plot
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(data["x"], data["y1"], label="sin(x)", marker="o", markevery=10)
    ax1.plot(data["x"], data["y2"], label="cos(x)", linestyle="--")
    ax1.set_xlabel("X axis")
    ax1.set_ylabel("Y axis")
    ax1.set_title("Line Plot")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Scatter plot, colored by distance from the origin
    ax2 = fig.add_subplot(gs[0, 1])
    distances = np.sqrt(data["scatter_x"] ** 2 + data["scatter_y"] ** 2)
    scatter = ax2.scatter(
        data["scatter_x"], data["scatter_y"], c=distances, cmap="viridis", alpha=0.6, s=50
    )
    ax2.set_xlabel("X axis")
    ax2.set_ylabel("Y axis")
    ax2.set_title("Scatter Plot")
    # Bind the colorbar to this figure explicitly rather than to whichever
    # figure pyplot currently considers active.
    cbar = fig.colorbar(scatter, ax=ax2)
    cbar.set_label("Distance")
    ax2.grid(True, alpha=0.3)

    # Bar chart
    ax3 = fig.add_subplot(gs[1, 0])
    bars = ax3.bar(data["categories"], data["bar_values"], edgecolor="black", linewidth=1)
    # Color bars with a gradient taken from the middle of the colormap
    bar_colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(bars)))
    for bar, bar_color in zip(bars, bar_colors):
        bar.set_facecolor(bar_color)
    ax3.set_xlabel("Categories")
    ax3.set_ylabel("Values")
    ax3.set_title("Bar Chart")
    ax3.grid(True, axis="y", alpha=0.3)

    # Two signals with shaded bands
    ax4 = fig.add_subplot(gs[1, 1])
    band_half_width = 0.2
    ax4.plot(data["x"], data["y1"], label="Signal 1", linewidth=2)
    # The band is a fixed offset around the signal, not a computed standard
    # deviation, so the legend states the offset that is actually drawn.
    ax4.fill_between(
        data["x"],
        data["y1"] - band_half_width,
        data["y1"] + band_half_width,
        alpha=0.3,
        label=f"±{band_half_width} band",
    )
    ax4.plot(data["x"], data["y2"], label="Signal 2", linewidth=2)
    ax4.fill_between(
        data["x"], data["y2"] - band_half_width, data["y2"] + band_half_width, alpha=0.3
    )
    ax4.set_xlabel("X axis")
    ax4.set_ylabel("Y axis")
    ax4.set_title("Time Series with Uncertainty")
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    fig.suptitle("Style Preview", fontsize=16, fontweight="bold")

    return fig
196
+
197
+
198
def _format_rc_value(value):
    """Render one rcParams value in the syntax accepted by style files."""
    if isinstance(value, (list, tuple)):
        return ", ".join(_format_rc_value(item) for item in value)

    if isinstance(value, str) and value.startswith("#"):
        digits = value[1:]
        is_hex = bool(digits) and all(ch in "0123456789abcdefABCDEF" for ch in digits)
        if is_hex and len(digits) in (3, 4, 6, 8):
            # An unquoted '#' starts a comment in a style file, which would
            # blank out the colour. The hash-less 6/8-digit hex form is
            # understood by matplotlib, so expand shorthand and drop the '#'.
            if len(digits) in (3, 4):
                digits = "".join(ch * 2 for ch in digits)
            return digits

    return str(value)


def save_style_file(style_dict, filename):
    """Write a style dictionary to ``filename`` in ``.mplstyle`` format.

    Settings are grouped under comment headings by rcParams prefix and sorted
    by key within each group. Keys that match no known prefix are written
    under an "Other" heading instead of being dropped.

    Args:
        style_dict: Mapping of rcParams names to values. Lists/tuples are
            written comma-separated; hex colours are written without the
            leading ``#`` so they are not parsed as comments.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    sections = (
        ("Figure", ("figure.",)),
        ("Font", ("font.",)),
        ("Axes", ("axes.",)),
        ("Lines", ("lines.",)),
        ("Markers", ("markers.",)),
        ("Ticks", ("tick.", "xtick.", "ytick.")),
        ("Grid", ("grid.",)),
        ("Legend", ("legend.",)),
        ("Savefig", ("savefig.",)),
        ("Text", ("text.",)),
    )

    # Work on a copy so the caller's dictionary is never mutated.
    pending = dict(style_dict)
    grouped = []
    for title, prefixes in sections:
        keys = sorted(key for key in pending if key.startswith(prefixes))
        if keys:
            grouped.append((title, [(key, pending.pop(key)) for key in keys]))
    if pending:
        grouped.append(("Other", sorted(pending.items())))

    with open(filename, "w", encoding="utf-8") as f:
        f.write("# Custom matplotlib style\n")
        f.write("# Generated by style_configurator.py\n\n")

        for title, items in grouped:
            f.write(f"# {title}\n")
            f.writelines(f"{key}: {_format_rc_value(value)}\n" for key, value in items)
            f.write("\n")

    print(f"Style saved to {filename}")
236
+
237
+
238
def print_style_info(style_dict):
    """Print every setting in ``style_dict``, grouped by rcParams prefix.

    Settings are listed under a heading per category, sorted by key. Keys
    that match no known prefix are listed under "Other Settings" so that the
    printout always covers the complete style.

    Args:
        style_dict: Mapping of rcParams names to values.
    """
    print("\n" + "=" * 60)
    print("STYLE CONFIGURATION")
    print("=" * 60)

    categories = {
        "Figure Settings": ("figure.",),
        "Font Settings": ("font.",),
        "Axes Settings": ("axes.",),
        "Line Settings": ("lines.",),
        "Grid Settings": ("grid.",),
        "Legend Settings": ("legend.",),
        "Tick Settings": ("xtick.", "ytick."),
        "Text Settings": ("text.",),
        "Savefig Settings": ("savefig.",),
    }

    # Track what has not been shown yet so nothing is silently omitted.
    remaining = dict(style_dict)
    sections = []
    for category, prefixes in categories.items():
        keys = sorted(key for key in remaining if key.startswith(prefixes))
        if keys:
            sections.append((category, [(key, remaining.pop(key)) for key in keys]))
    if remaining:
        sections.append(("Other Settings", sorted(remaining.items())))

    for category, items in sections:
        print(f"\n{category}:")
        for key, value in items:
            print(f" {key}: {value}")

    print("\n" + "=" * 60 + "\n")
263
+
264
+
265
+ def list_available_presets():
266
+ """Print available style presets."""
267
+ print("\nAvailable style presets:")
268
+ print("-" * 40)
269
+ descriptions = {
270
+ "publication": "Optimized for academic publications",
271
+ "presentation": "Large fonts for presentations",
272
+ "web": "Optimized for web display",
273
+ "dark": "Dark background theme",
274
+ "minimal": "Minimal, clean style",
275
+ }
276
+ for preset, desc in descriptions.items():
277
+ print(f" {preset:15s} - {desc}")
278
+ print("-" * 40 + "\n")
279
+
280
+
281
def _prompt_float(prompt, default, is_valid=None, hint="a number"):
    """Ask for a float until the answer parses and passes ``is_valid``.

    An empty answer selects ``default``. Invalid or non-finite input prints a
    short message and re-prompts instead of raising.
    """
    import math  # local import: only this helper needs it

    while True:
        raw = input(prompt).strip() or default
        try:
            value = float(raw)
        except ValueError:
            value = None
        if (
            value is not None
            and math.isfinite(value)
            and (is_valid is None or is_valid(value))
        ):
            return value
        print(f" Invalid value {raw!r}; please enter {hint}.")


def interactive_mode():
    """Build a style dictionary from answers typed at the terminal.

    The user optionally starts from a named preset, then repeatedly picks a
    menu option (figure size, font sizes, line width, grid, colour theme)
    until choosing option 6. Numeric answers are validated and re-prompted on
    bad input, so a typo no longer aborts the session.

    Returns:
        dict: The rcParams chosen by the user (possibly empty).
    """
    print("\n" + "=" * 60)
    print("MATPLOTLIB STYLE CONFIGURATOR - Interactive Mode")
    print("=" * 60)

    list_available_presets()

    preset = input("Choose a preset to start from (or 'custom' for default): ").strip().lower()

    if preset in STYLE_PRESETS:
        # Copy so edits made below never leak into the shared preset table.
        style_dict = STYLE_PRESETS[preset].copy()
        print(f"\nStarting from '{preset}' preset")
    else:
        style_dict = {}
        print("\nStarting from default matplotlib style")

    print("\nCommon settings you might want to customize:")
    print(" 1. Figure size")
    print(" 2. Font sizes")
    print(" 3. Line widths")
    print(" 4. Grid settings")
    print(" 5. Color scheme")
    print(" 6. Done, show preview")

    def positive(value):
        return value > 0

    while True:
        choice = input("\nSelect option (1-6): ").strip()

        if choice == "1":
            width = _prompt_float(
                " Figure width (inches, default 10): ", "10", positive, "a positive number"
            )
            height = _prompt_float(
                " Figure height (inches, default 6): ", "6", positive, "a positive number"
            )
            style_dict["figure.figsize"] = (width, height)

        elif choice == "2":
            base = _prompt_float(
                " Base font size (default 12): ", "12", positive, "a positive number"
            )
            style_dict["font.size"] = base
            style_dict["axes.labelsize"] = base + 2
            style_dict["axes.titlesize"] = base + 4

        elif choice == "3":
            style_dict["lines.linewidth"] = _prompt_float(
                " Line width (default 2): ",
                "2",
                lambda value: value >= 0,
                "a non-negative number",
            )

        elif choice == "4":
            grid = input(" Enable grid? (y/n): ").strip().lower()
            style_dict["axes.grid"] = grid in ("y", "yes")
            if style_dict["axes.grid"]:
                style_dict["grid.alpha"] = _prompt_float(
                    " Grid transparency (0-1, default 0.3): ",
                    "0.3",
                    lambda value: 0 <= value <= 1,
                    "a number between 0 and 1",
                )

        elif choice == "5":
            print(" Theme options: 1=Light, 2=Dark")
            theme = input(" Select theme (1-2): ").strip()
            if theme == "2":
                style_dict.update(STYLE_PRESETS["dark"])

        elif choice == "6":
            break

        else:
            # Previously an unknown choice looped with no feedback.
            print(" Invalid option; please enter a number from 1 to 6.")

    return style_dict
341
+
342
+
343
def main():
    """Command-line entry point for the style configurator.

    Flags:
        --list         Print the bundled presets and matplotlib's built-in
                       styles, then exit.
        --preset NAME  Start from one of the bundled presets.
        --interactive  Build the style from terminal prompts (takes
                       precedence over --preset) and show a preview.
        --output FILE  Save the resulting style to FILE.
        --preview      Show a preview figure.

    When a preview is drawn and --output is given, the preview image is
    saved next to the style file as ``<output without extension>_preview.png``.
    """
    import os  # local import: only used to derive the preview file name

    parser = argparse.ArgumentParser(
        description="Matplotlib style configurator",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Show available presets
python style_configurator.py --list

# Preview a preset
python style_configurator.py --preset publication --preview

# Save a preset as .mplstyle file
python style_configurator.py --preset publication --output my_style.mplstyle

# Interactive mode
python style_configurator.py --interactive
""",
    )
    parser.add_argument(
        "--preset",
        type=str,
        choices=list(STYLE_PRESETS.keys()),
        help="Use a predefined style preset",
    )
    parser.add_argument("--output", type=str, help="Save style to .mplstyle file")
    parser.add_argument("--preview", action="store_true", help="Show style preview")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--interactive", action="store_true", help="Run in interactive mode")

    args = parser.parse_args()

    if args.list:
        list_available_presets()
        # Also show currently available matplotlib styles
        print("\nBuilt-in matplotlib styles:")
        print("-" * 40)
        for style in sorted(plt.style.available):
            print(f" {style}")
        return

    if args.interactive:
        style_dict = interactive_mode()
    elif args.preset:
        style_dict = STYLE_PRESETS[args.preset].copy()
        print(f"Using '{args.preset}' preset")
    else:
        print("No preset or interactive mode specified. Showing default preview.")
        style_dict = {}

    if style_dict:
        print_style_info(style_dict)

    if args.output:
        save_style_file(style_dict, args.output)

    if args.preview or args.interactive:
        print("Creating style preview...")
        fig = create_style_preview(style_dict if style_dict else None)

        if args.output:
            # Derive the image name from the path without its extension.
            # The old str.replace(".mplstyle", ...) left the name unchanged
            # when the suffix was absent, so the preview targeted the style
            # file that had just been written.
            root, _ext = os.path.splitext(args.output)
            preview_filename = f"{root}_preview.png"
            fig.savefig(preview_filename, dpi=150, bbox_inches="tight")
            print(f"Preview saved to {preview_filename}")

        plt.show()
410
+
411
+
412
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()