pdfsys bootstrap committed on
Commit ·
67495fe
0
Parent(s):
chore: bootstrap pdfsys workspace with 7 package skeletons
Browse files
Initial scaffolding only — package layouts, pyproject.toml files,
__init__.py stubs, README, .gitignore, LICENSE, and the existing
docs/PRD.md. No runtime code yet.
Packages:
- pdfsys-core shared dataclasses + manifest + layout cache
- pdfsys-router stage-A (text-ok vs ocr) + stage-B (pipeline vs vlm)
- pdfsys-layout-analyser runs once, caches LayoutDocument
- pdfsys-parser-mupdf text-ok backend
- pdfsys-parser-pipeline needs-ocr + simple layout backend
- pdfsys-parser-vlm needs-ocr + complex layout backend
- pdfsys-bench cross-backend evaluation harness
- .gitignore +35 -0
- LICENSE +17 -0
- README.md +62 -0
- docs/PRD.md +440 -0
- packages/pdfsys-bench/pyproject.toml +18 -0
- packages/pdfsys-bench/src/pdfsys_bench/__init__.py +7 -0
- packages/pdfsys-core/pyproject.toml +13 -0
- packages/pdfsys-core/src/pdfsys_core/__init__.py +7 -0
- packages/pdfsys-core/src/pdfsys_core/cache.py +7 -0
- packages/pdfsys-core/src/pdfsys_core/config.py +4 -0
- packages/pdfsys-core/src/pdfsys_core/layout.py +4 -0
- packages/pdfsys-core/src/pdfsys_core/manifest.py +4 -0
- packages/pdfsys-core/src/pdfsys_core/types.py +4 -0
- packages/pdfsys-layout-analyser/pyproject.toml +18 -0
- packages/pdfsys-layout-analyser/src/pdfsys_layout_analyser/__init__.py +10 -0
- packages/pdfsys-layout-analyser/src/pdfsys_layout_analyser/analyser.py +4 -0
- packages/pdfsys-parser-mupdf/pyproject.toml +18 -0
- packages/pdfsys-parser-mupdf/src/pdfsys_parser_mupdf/__init__.py +8 -0
- packages/pdfsys-parser-mupdf/src/pdfsys_parser_mupdf/extract.py +1 -0
- packages/pdfsys-parser-pipeline/pyproject.toml +18 -0
- packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/__init__.py +9 -0
- packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/extract.py +1 -0
- packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/ocr_engine.py +5 -0
- packages/pdfsys-parser-vlm/pyproject.toml +18 -0
- packages/pdfsys-parser-vlm/src/pdfsys_parser_vlm/__init__.py +8 -0
- packages/pdfsys-parser-vlm/src/pdfsys_parser_vlm/extract.py +1 -0
- packages/pdfsys-router/pyproject.toml +18 -0
- packages/pdfsys-router/src/pdfsys_router/__init__.py +9 -0
- packages/pdfsys-router/src/pdfsys_router/classifier.py +4 -0
- packages/pdfsys-router/src/pdfsys_router/decider.py +4 -0
- pyproject.toml +40 -0
.gitignore
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.egg-info/
|
| 7 |
+
.eggs/
|
| 8 |
+
build/
|
| 9 |
+
dist/
|
| 10 |
+
|
| 11 |
+
# uv / virtualenv
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
uv.lock
|
| 15 |
+
|
| 16 |
+
# local pipeline scratch
|
| 17 |
+
work/
|
| 18 |
+
output/
|
| 19 |
+
.cache/
|
| 20 |
+
samples/
|
| 21 |
+
bench_data/
|
| 22 |
+
*.layout.json
|
| 23 |
+
|
| 24 |
+
# models / weights (too big for git)
|
| 25 |
+
models/
|
| 26 |
+
*.onnx
|
| 27 |
+
*.safetensors
|
| 28 |
+
*.bin
|
| 29 |
+
*.pt
|
| 30 |
+
|
| 31 |
+
# OS / editor
|
| 32 |
+
.DS_Store
|
| 33 |
+
.idea/
|
| 34 |
+
.vscode/
|
| 35 |
+
*.swp
|
LICENSE
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
Copyright 2026 MNBVC Contributors
|
| 6 |
+
|
| 7 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 8 |
+
you may not use this file except in compliance with the License.
|
| 9 |
+
You may obtain a copy of the License at
|
| 10 |
+
|
| 11 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 12 |
+
|
| 13 |
+
Unless required by applicable law or agreed to in writing, software
|
| 14 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 15 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 16 |
+
See the License for the specific language governing permissions and
|
| 17 |
+
limitations under the License.
|
README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pdfsys-mnbvc
|
| 2 |
+
|
| 3 |
+
PB-scale PDF → pretraining-data pipeline for the MNBVC corpus project.
|
| 4 |
+
FinePDFs-inspired architecture adapted for Chinese-heavy, mixed-quality input.
|
| 5 |
+
|
| 6 |
+
## Architecture
|
| 7 |
+
|
| 8 |
+
Two-stage routing, cascaded:
|
| 9 |
+
|
| 10 |
+
```
|
| 11 |
+
┌──────────────┐
|
| 12 |
+
PDF ─► │ pdfsys-router│ stage A (cheap classifier)
|
| 13 |
+
└──────┬───────┘
|
| 14 |
+
│
|
| 15 |
+
text-ok ◄──┴──► needs-ocr
|
| 16 |
+
│ │
|
| 17 |
+
▼ ▼
|
| 18 |
+
parser-mupdf pdfsys-layout-analyser (runs once, caches LayoutDocument)
|
| 19 |
+
│
|
| 20 |
+
▼
|
| 21 |
+
stage B decision
|
| 22 |
+
│
|
| 23 |
+
no-complex ◄───┴───► complex (tables / formulas)
|
| 24 |
+
│ │
|
| 25 |
+
▼ ▼
|
| 26 |
+
parser-pipeline parser-vlm
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
The `LayoutDocument` produced by `pdfsys-layout-analyser` is cached to disk
|
| 30 |
+
and consumed by **both** the stage-B decision in `pdfsys-router` **and** the
|
| 31 |
+
downstream parser backend — layout inference runs at most once per PDF.
|
| 32 |
+
|
| 33 |
+
## Workspace packages
|
| 34 |
+
|
| 35 |
+
| Package | Role |
|
| 36 |
+
|---|---|
|
| 37 |
+
| `pdfsys-core` | Shared dataclasses (`PdfRecord`, `LayoutDocument`), manifest IO, layout cache. No PDF/ML deps. |
|
| 38 |
+
| `pdfsys-router` | Two-stage router. Stage A text-ok/needs-ocr; Stage B pipeline/vlm from cached layout. |
|
| 39 |
+
| `pdfsys-layout-analyser` | Page layout model runner (PP-DocLayoutV3 / docling-layout-heron). Runs once, writes cache. |
|
| 40 |
+
| `pdfsys-parser-mupdf` | Text-ok backend. PyMuPDF + reading order → Markdown. |
|
| 41 |
+
| `pdfsys-parser-pipeline` | Needs-ocr + simple layout backend. Region-level OCR (RapidOCR / PaddleOCR-classic). |
|
| 42 |
+
| `pdfsys-parser-vlm` | Needs-ocr + complex layout backend. MinerU 2.5 / PaddleOCR-VL on complex regions. |
|
| 43 |
+
| `pdfsys-bench` | Cross-backend throughput / latency / F1 evaluation. |
|
| 44 |
+
|
| 45 |
+
## Setup (macOS)
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
# Requires uv >= 0.4
|
| 49 |
+
uv sync
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
Running a single PDF through the pipeline, and orchestration above the
|
| 53 |
+
extraction core (ingest / dedup / quality / tokenize) are not implemented
|
| 54 |
+
yet — see `docs/PRD.md` for the full design.
|
| 55 |
+
|
| 56 |
+
## Docs
|
| 57 |
+
|
| 58 |
+
- `docs/PRD.md` — full PRD with resource budgets and roadmap.
|
| 59 |
+
|
| 60 |
+
## License
|
| 61 |
+
|
| 62 |
+
Apache-2.0
|
docs/PRD.md
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDFSystem-MNBVC · PRD
|
| 2 |
+
|
| 3 |
+
> PB 级 PDF → 预训练数据处理系统
|
| 4 |
+
> 对标 HuggingFace FinePDFs · 面向 MNBVC 中文语料
|
| 5 |
+
> v0.1 · 2026-04-11
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 0. TL;DR
|
| 10 |
+
|
| 11 |
+
构建一套可在 10–100 GPU 小集群上长期稳定运行的 PDF → 预训练数据处理系统,将 PB 级原始 PDF 高质量、高吞吐、低成本地转换为结构化中文预训练数据。
|
| 12 |
+
|
| 13 |
+
关键取舍:
|
| 14 |
+
|
| 15 |
+
- **双路径 + 前置分流**:用 CPU 上 ≤10 ms 的 XGBoost 路由器把 ~90% 的页面送入 CPU 文本路径(PyMuPDF),只让 ~10% 的页面看到 GPU(借鉴 FinePDFs)。
|
| 16 |
+
- **小模型够用**:GPU 路径主选 **MinerU 2.5-Pro 1.2B**(中文字符 F1 0.965,A100 2.12 fps),备选 **PaddleOCR-VL 0.9B**(吞吐快 15.8%)。不调用任何商业 API。
|
| 17 |
+
- **编排复用**:直接使用 `datatrove`(FinePDFs 同款),省去自研编排器 80% 的工作量。
|
| 18 |
+
- **资源反推**:1 PB 原始 PDF ≈ 5 亿文档 ≈ 100 亿页,在 100 × A100 + 32 节点 CPU 集群下全量处理墙钟约 2 个月。
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## 1. 背景与目标
|
| 23 |
+
|
| 24 |
+
MNBVC 是一个长期收集中文语料的开源项目,累积的原始 PDF 已逼近 PB 量级且仍在增长。这些 PDF 覆盖学术论文、政府公文、行业报告、电子书、扫描古籍、报纸期刊等极其异质的来源,是中文大模型预训练数据中一座尚未被充分开采的金矿。
|
| 25 |
+
|
| 26 |
+
本项目目标是把这些 PDF 转成可直接用于大模型预训练的结构化文本,并保持完整的可复现性与可追溯性。设计上重度借鉴 HuggingFace 2025 年开源的 FinePDFs(475M 文档、1733 语种、3T tokens)的工程经验,同时针对中文语料、有限算力与本地部署做关键取舍。
|
| 27 |
+
|
| 28 |
+
### 1.1 关键非功能目标
|
| 29 |
+
|
| 30 |
+
| 维度 | 目标 |
|
| 31 |
+
| --- | --- |
|
| 32 |
+
| 吞吐 | 10–100 GPU 集群下,单月可消化 50–500 TB 原始 PDF |
|
| 33 |
+
| 成本 | GPU 成本占比 ≤ 35%,其余由 CPU 路径承担 |
|
| 34 |
+
| 质量 | 中文 OCR 字符 F1 ≥ 0.95;阅读顺序还原正确率 ≥ 0.90 |
|
| 35 |
+
| 可复现 | 任意 shard 可独立重跑且产物字节级一致 |
|
| 36 |
+
| 断点续跑 | 允许任意节点失败,整体进度不丢 |
|
| 37 |
+
| 合规 | PII 自动脱敏、来源可追溯、license 元数据完整保留 |
|
| 38 |
+
|
| 39 |
+
### 1.2 非目标
|
| 40 |
+
|
| 41 |
+
- 不做多模态数据抽取(图像、图表作为语义单元进入训练数据),这是未来版本的事。
|
| 42 |
+
- 不做实时处理,整体是 batch pipeline。
|
| 43 |
+
- 不做训练侧的数据混合策略,那是下游训练框架的职责。
|
| 44 |
+
|
| 45 |
+
---
|
| 46 |
+
|
| 47 |
+
## 2. 关键设计洞察
|
| 48 |
+
|
| 49 |
+
在画架构图之前,先把 FinePDFs 等先行项目踩过的坑提炼为六条贯穿全局的设计原则。后续每一个模块的取舍都要回到这六条做合理性检验。
|
| 50 |
+
|
| 51 |
+
### 2.1 分流先行:90% 的页面不该看到 GPU
|
| 52 |
+
|
| 53 |
+
FinePDFs 团队最重要的工程发现是:一份普通的 PDF 语料里,绝大多数页面其实是 born-digital 的,只有约 5–10% 是扫描件或文本流损坏的页面。如果用统一的 GPU OCR 流水线处理全部页面,整个项目的 GPU 成本会比理论下限高 10–20 倍。本系统沿用 FinePDFs 的 XGBoost OCR 路由器思路,这是整个成本模型的命门。
|
| 54 |
+
|
| 55 |
+
### 2.2 小模型已经够用:拒绝商业 API 与百亿 VLM
|
| 56 |
+
|
| 57 |
+
2025 H2–2026 Q1 的开源社区里,MinerU 2.5-Pro 1.2B(OpenDataLab)和 PaddleOCR-VL 0.9B(百度)这两个亚 1.5B 参数的解耦式 VLM 已经在 OmniDocBench 上全面超越 Gemini 2.5 Pro 等闭源大模型,尤其是 MinerU 2.5 在中文文档上拿下 0.965 的字符 F1。一张 A100 80G 可以并行驻留 16 个以上的 1.2B 模型副本。
|
| 58 |
+
|
| 59 |
+
### 2.3 CPU 与 GPU 必须分车道
|
| 60 |
+
|
| 61 |
+
PyMuPDF 在单核上的吞吐是 10–30 PDF/秒,MinerU 2.5 在单卡 A100 上是 2.12 页/秒。按平均每份 PDF 约 20 页折算,两者吞吐相差两到三个数量级。如果共享同一个调度器和队列,慢的一端会立刻变成快的一端的瓶颈(典型 head-of-line blocking)。系统为 CPU 与 GPU 各开独立通道,中间用对象存储 staging 解耦,反压机制让快通道根据慢通道水位自动节流。
|
| 62 |
+
|
| 63 |
+
### 2.4 以页为最小调度单位,文档只是聚合视图
|
| 64 |
+
|
| 65 |
+
一份 PDF 内部的页面同质性远低于直觉。一篇论文常常前 10 页 born-digital、最后附录的扫描表格是图像。如果以"文档"为最小单位,会强迫整篇文档走最重的那条路径。本系统以"页"为最小处理单元、以"文档"为最终聚合单元。
|
| 66 |
+
|
| 67 |
+
### 2.5 廉价过滤前置:不要对垃圾页跑 OCR
|
| 68 |
+
|
| 69 |
+
语种识别、长度过滤、模板化页面(页眉页脚、版权页、空白页)这些动作都能在 CPU 路径里以毫秒级成本完成。把它们前置在 OCR 之前,可以再砍掉 20–40% 的 GPU 工作量。
|
| 70 |
+
|
| 71 |
+
### 2.6 idempotent + checkpoint:失败是常态
|
| 72 |
+
|
| 73 |
+
在 100 GPU × 数月的时间窗口里,节点失败、网络抖动、显存 OOM、模型 NaN、对象存储限流都是必然事件。每个 stage 都设计为幂等(同一输入 → 同一输出)+ shard 级 manifest checkpoint,任意时刻杀掉所有 worker 重启都能从断点继续。
|
| 74 |
+
|
| 75 |
+
---
|
| 76 |
+
|
| 77 |
+
## 3. 总体架构
|
| 78 |
+
|
| 79 |
+
6 个串行 stage + 3 层数据存储。串行只是逻辑视图——实际上每个 stage 内部有大量并行 worker,相邻 stage 之间通过对象存储解耦,可以异步推进。
|
| 80 |
+
|
| 81 |
+
### 3.1 数据流
|
| 82 |
+
|
| 83 |
+
```text
|
| 84 |
+
原始 PDF 对象存储
|
| 85 |
+
│
|
| 86 |
+
▼
|
| 87 |
+
[Stage 0] Ingestion & Sharding ── manifest.parquet (sha256, size, src)
|
| 88 |
+
│
|
| 89 |
+
▼
|
| 90 |
+
[Stage 1] Triage Classifier (CPU, XGBoost)
|
| 91 |
+
│
|
| 92 |
+
├── TEXT_OK ──▶ [Stage 2A] CPU 文本路径
|
| 93 |
+
│ PyMuPDF + 轻量 layout
|
| 94 |
+
│ │
|
| 95 |
+
│ ▼
|
| 96 |
+
│ Markdown + meta
|
| 97 |
+
│
|
| 98 |
+
├── NEEDS_OCR ─▶ [Stage 2B] GPU 视觉路径
|
| 99 |
+
│ PP-DocLayoutV3 + MinerU 2.5 / PaddleOCR-VL
|
| 100 |
+
│ │
|
| 101 |
+
│ ▼
|
| 102 |
+
│ Markdown + meta
|
| 103 |
+
│
|
| 104 |
+
└── REJECT ───▶ Quarantine bucket(人工/重训)
|
| 105 |
+
│
|
| 106 |
+
▼
|
| 107 |
+
[Stage 3] Postprocess: 阅读顺序、跨页合并、公式表格归一
|
| 108 |
+
│
|
| 109 |
+
▼
|
| 110 |
+
[Stage 4] Quality / Lang / PII / Dedup(精确 + MinHash)
|
| 111 |
+
│
|
| 112 |
+
▼
|
| 113 |
+
[Stage 5] 输出打包:Parquet shards + JSONL + Markdown 抽样
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### 3.2 三层存储
|
| 117 |
+
|
| 118 |
+
- **L0 原始层(cold)**:S3/OSS/MinIO,PB 级,原始 PDF 不可变,按 sha256 前缀分目录。
|
| 119 |
+
- **L1 中间层(warm)**:对象存储 + Parquet/JSONL 分片,存放每个 stage 的中间产物。L1 设计为可丢弃——任意时刻清空都能用上游重建。
|
| 120 |
+
- **L2 输出层(hot)**:最终 Parquet 数据集,按语种 / 来源 / 质量分桶,供训练框架和 HuggingFace datasets 直接消费。
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## 4. 模块详解
|
| 125 |
+
|
| 126 |
+
### 4.1 Stage 0:数据接入与切片
|
| 127 |
+
|
| 128 |
+
入库时流式扫描计算 sha256、大小、来源 URL、首个 PDF Producer 字段,写入 `manifest.parquet`。这个 manifest 是后续所有 stage 的唯一 source of truth。
|
| 129 |
+
|
| 130 |
+
- **Sharding**:按 sha256 前两位 hex 切成 256 个 shard(再细可到 1024),每 shard 200 万–500 万个 PDF,单 worker 处理粒度可控。
|
| 131 |
+
- **前置精确去重**:同 sha256 只保留一条。
|
| 132 |
+
- **PDF 健康检查**:用 PyMuPDF 尝试 open + pages,捕获损坏文件并打 tag,避免后续 worker 反复崩。
|
| 133 |
+
|
| 134 |
+
### 4.2 Stage 1:PDF 分流分类器
|
| 135 |
+
|
| 136 |
+
整个系统省钱最重要的模块,直接复刻 FinePDFs 的 OCR Predictor 思路。XGBoost 是有意为之的选择——纯 CPU 推理 ≤10 ms/PDF,模型几 MB,部署零负担。
|
| 137 |
+
|
| 138 |
+
**特征**(CPU 提取,全部来自 PyMuPDF):
|
| 139 |
+
|
| 140 |
+
- 内嵌文本字节数 / PDF 总字节数(关键比值,扫描件接近 0)
|
| 141 |
+
- 页面数、平均页面像素面积、是否含字体子集
|
| 142 |
+
- 图像对象总面积 / 页面总面积
|
| 143 |
+
- ToUnicode CMap 缺失率(中文古籍 / 老 PDF 的关键信号)
|
| 144 |
+
- 第一页、中间页、最后一页可提取文本长度三元组
|
| 145 |
+
- PDF Producer / Creator 字段(Word / LaTeX / 扫描软件 / Office)
|
| 146 |
+
- XObject 数量与 Form XObject 占比
|
| 147 |
+
|
| 148 |
+
**输出 3 类标签**:
|
| 149 |
+
|
| 150 |
+
- `TEXT_OK` → 走 Stage 2A
|
| 151 |
+
- `NEEDS_OCR` → 走 Stage 2B
|
| 152 |
+
- `REJECT` → 损坏 / 加密 / 0 页 / 全空白,进 quarantine
|
| 153 |
+
|
| 154 |
+
训练数据:5–10 万份人工标注 + 启发式弱标注。目标精确率 ≥ 95%、召回率 ≥ 90%。误判 TEXT_OK 但实际抽不出文字的样本会经 Stage 2A 的失败回退送回 Stage 2B。
|
| 155 |
+
|
| 156 |
+
### 4.3 Stage 2A:CPU 文本路径
|
| 157 |
+
|
| 158 |
+
目标是用最少的 CPU 时间把可提取文本流的 PDF 转成结构良好的 Markdown。FinePDFs 在这一路径上用的是 Docling + Layout Heron int8 量化版;本系统做以下几点中文本土化调整:
|
| 159 |
+
|
| 160 |
+
- **解析后端**:PyMuPDF 1.27+(基于 MuPDF 1.27.x)。中文 cmap 处理和文本流还原比 Docling 默认后端更稳,每核 10–30 PDF/秒。
|
| 161 |
+
- **轻量布局**:PP-DocLayoutV3 的 ONNX int8 量化版,CPU 推理 ~50 ms/页;triage 非常干净时可 fallback 到纯启发式(bbox 列数聚类 + 字号聚类)。
|
| 162 |
+
- **阅读顺序**:双栏检测 + 段落合并 + 跨页折行还原。
|
| 163 |
+
- **失败回退**:若提取出的字符数小于阈值(如 < 0.3 × 期望字符数),自动改写 manifest 把该 PDF 丢回 Stage 2B。这是 triage 误差的安全网。
|
| 164 |
+
|
| 165 |
+
### 4.4 Stage 2B:GPU 视觉路径
|
| 166 |
+
|
| 167 |
+
处理真正难啃的部分:扫描件、图像 PDF、文本流损坏、版式极端的页面。
|
| 168 |
+
|
| 169 |
+
**主选:MinerU 2.5-Pro 1.2B**
|
| 170 |
+
|
| 171 |
+
- 中文字符 F1 0.965(OmniDocBench 中文 SOTA)
|
| 172 |
+
- 解耦式 VLM:先布局再 patch-level OCR,对长页友好
|
| 173 |
+
- 吞吐:A100 80G vLLM async 2.12 页/s;H200 4.47 页/s
|
| 174 |
+
- 端到端覆盖文本 / 表格 / 公式 / 图像区,输出结构化 JSON → Markdown
|
| 175 |
+
- 显存占用 fp16 ≈ 3 GB,A100 80G 可并行 16+ 副本
|
| 176 |
+
|
| 177 |
+
**备选 / 混部:PaddleOCR-VL 0.9B**
|
| 178 |
+
|
| 179 |
+
- 吞吐比 MinerU 2.5 高 15.8%,显存比 dots.ocr 省 40%
|
| 180 |
+
- OmniDocBench v1.5 总分 92.56(高于 MinerU 2.5 的 90.67)
|
| 181 |
+
- 中文略弱于 MinerU 2.5,但在多语种与吞吐敏感场景上更好
|
| 182 |
+
|
| 183 |
+
**调度策略**:路由器在 Stage 1 输出之外再附一个二级 hint——主语种为中文且含较多公式表格 → MinerU 2.5;其它 → PaddleOCR-VL。两套模型共享同一 worker pool,只是加载不同权重。
|
| 184 |
+
|
| 185 |
+
**推理引擎与批处理**:
|
| 186 |
+
|
| 187 |
+
- 生产环境用 **LMDeploy**(FinePDFs 同款,比 vLLM 省显存、首 token 延迟更低)
|
| 188 |
+
- 动态 batching:max batch 16、max seq 8192、超长页强制切块
|
| 189 |
+
- 常驻模型:worker 一次加载、长生命周期
|
| 190 |
+
- 失败兜底:单页 OOM 自动降 batch 重试 ≤ 2 次后写 quarantine
|
| 191 |
+
|
| 192 |
+
### 4.5 Stage 3:后处理
|
| 193 |
+
|
| 194 |
+
无论来自 CPU 还是 GPU 路径,都进入统一的后处理流水线:
|
| 195 |
+
|
| 196 |
+
- 阅读顺序最终重排(跨页表格合并、脚注挂回正文、双栏交错修正)
|
| 197 |
+
- 段落合并(基于行尾标点与中文断句规则修复折行)
|
| 198 |
+
- 公式归一(LaTeX 用 KaTeX / MathJax 解析校验,失败的退化为图像占位)
|
| 199 |
+
- 表格归一(HTML / Markdown 双格式存储,行列校验失败的转 image-placeholder)
|
| 200 |
+
- Unicode 归一(NFC + 全半角统一 + 控制字符剔除 + 零宽字符清理)
|
| 201 |
+
- 元数据补全(每段记录来源页码、bbox、置信度)
|
| 202 |
+
|
| 203 |
+
### 4.6 Stage 4:质量过滤、语种、PII、去重
|
| 204 |
+
|
| 205 |
+
#### 4.6.1 语种识别
|
| 206 |
+
|
| 207 |
+
GlotLID(FinePDFs 同款),**段落级**而非文档级——一篇中英混排的论文里,参考文献段打 en,正文打 zh,下游可以分别处理。
|
| 208 |
+
|
| 209 |
+
#### 4.6.2 启发式质量过滤
|
| 210 |
+
|
| 211 |
+
- 重复 n-gram 比例(去除 OCR 串行错位产物)
|
| 212 |
+
- 非 CJK / 非 ASCII 符号比例(去除符号噪声)
|
| 213 |
+
- 行长方差与平均行长(去除 OCR 抖动)
|
| 214 |
+
- URL / email 占比、纯数字占比(去除目录页 / 广告页)
|
| 215 |
+
- 最短文档长度阈值(按语种自适应:zh ≥ 200 chars,en ≥ 500 chars)
|
| 216 |
+
|
| 217 |
+
#### 4.6.3 模型质量分类器
|
| 218 |
+
|
| 219 |
+
训练中文版 EduScore:fastText 起步 → DeBERTa-v3-tiny 升级。训练数据用高质量中文教育/百科语料 vs 论坛灌水/SEO 文本做对比。每段打 0–5 分,下游训练时按分桶 mix。
|
| 220 |
+
|
| 221 |
+
#### 4.6.4 PII 脱敏
|
| 222 |
+
|
| 223 |
+
- 正则:身份证 18 位、手机号、银行卡(Luhn 校验)、邮箱、IPv4/IPv6
|
| 224 |
+
- 中国特化:车牌号、统一社会信用代码、护照号
|
| 225 |
+
- 命名实体兜底:MiniLM / BERT-tiny NER,仅在正则未命中时启用
|
| 226 |
+
- 策略:替换为 `⟨PII:phone⟩` 等占位符,原值哈希存入审计表(不入训练数据)
|
| 227 |
+
|
| 228 |
+
#### 4.6.5 去重
|
| 229 |
+
|
| 230 |
+
- 第一遍精确去重:sha256(Stage 0 完成)
|
| 231 |
+
- 第二遍内容精确去重:normalize 后文本的 md5
|
| 232 |
+
- 第三遍模糊去重:MinHash LSH(5-gram、num_hashes=128、threshold 0.85),`datatrove` 的 minhash block 可直接复用
|
| 233 |
+
- **跨 shard 去重是唯一需要全局 shuffle 的环节**:需要一个独立 pass,是整个 pipeline 里最昂贵的单个 stage
|
| 234 |
+
|
| 235 |
+
### 4.7 Stage 5:输出打包
|
| 236 |
+
|
| 237 |
+
最终对外的数据集采用 **Parquet 主格式 + JSONL 副格式 + Markdown 抽样存档**三件套。
|
| 238 |
+
|
| 239 |
+
- Parquet 分片:~1 GB / shard,按 `lang / source / quality_bucket` 分桶
|
| 240 |
+
- schema:`id, lang, source, text_md, text_plain, meta(json), quality_score, dedup_cluster_id, pii_redacted(bool)`
|
| 241 |
+
- 命名约定:`pdfsystem_mnbvc/v1/lang=zh/source=arxiv/qb=high/shard-00001.parquet`
|
| 242 |
+
- JSONL:与 Parquet 1:1 镜像,便于 grep / 抽样审计
|
| 243 |
+
- Markdown 抽样存档:每 shard 随机抽 0.1% 文档落盘原始 Markdown,长期保留作为人工审核基线
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
## 5. 编排与资源调度
|
| 248 |
+
|
| 249 |
+
编排框架直接选用 **datatrove**(FinePDFs 同款),它原生提供 Slurm 后端、shard 级 manifest checkpoint、minhash block 等关键能力。集群层面支持 Slurm 与 Kubernetes 双后端。
|
| 250 |
+
|
| 251 |
+
### 5.1 队列拓扑
|
| 252 |
+
|
| 253 |
+
- **Lane A(CPU)**:节点级数据并行,每节点 64–128 worker;节点本地 NVMe 做热数据缓存;产出写入 L1。
|
| 254 |
+
- **Lane B(GPU)**:每张 GPU 一个 worker 进程,模型常驻;输入来自 Stage 1 直接喂入或 Lane A 的失败回退。
|
| 255 |
+
- **Lane C(global ops)**:MinHash dedup、跨 shard 合并这类 shuffle 操作单独排队,避开 A/B 的高吞吐节奏。
|
| 256 |
+
|
| 257 |
+
### 5.2 反压与水位
|
| 258 |
+
|
| 259 |
+
Lane A 吞吐远高于 Lane B。如果不加控制,Lane B 的 staging 队列会无限增长。系统在 staging 桶上设置 **high / low watermark**:超过 high watermark 时 Lane A worker 主动 sleep,掉到 low watermark 再恢复。简单可靠,避免引入复杂的消息中间件。
|
| 260 |
+
|
| 261 |
+
### 5.3 Checkpoint 与断点续跑
|
| 262 |
+
|
| 263 |
+
- 每个 shard 完成后写一条 manifest record:`{shard_id, stage, status, output_path, n_docs, n_tokens, sha256}`
|
| 264 |
+
- worker 启动时先扫 manifest,跳过已完成的 shard
|
| 265 |
+
- 中间层产物本身就是 checkpoint,允许某个 stage 部分失败后只重跑该 stage
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## 6. 资源预算(PB 级反推)
|
| 270 |
+
|
| 271 |
+
以 1 PB 原始 PDF 为单位,参数取自 FinePDFs 公开数据与 MinerU 2.5 / PaddleOCR-VL 公开 benchmark。数量级估算,用于反推集群规模而非签 SLA。
|
| 272 |
+
|
| 273 |
+
### 6.1 数据量估算
|
| 274 |
+
|
| 275 |
+
| 指标 | 假设值 | 推导 |
|
| 276 |
+
| --- | --- | --- |
|
| 277 |
+
| 原始数据 | 1 PB | 目标输入 |
|
| 278 |
+
| 平均 PDF 大小 | 2 MB | MNBVC 抽样观测 |
|
| 279 |
+
| PDF 总数 | ≈ 5 × 10⁸ | 1 PB ÷ 2 MB |
|
| 280 |
+
| 平均页数 / PDF | 20 | 学术 + 报告混合 |
|
| 281 |
+
| 页面总数 | ≈ 1.0 × 10¹⁰ | 5e8 × 20 |
|
| 282 |
+
| 分流比例 | 90% CPU / 10% GPU | FinePDFs 经验,中文略偏 GPU |
|
| 283 |
+
| CPU 路径页数 | 9 × 10⁹ | |
|
| 284 |
+
| GPU 路径页数 | 1 × 10⁹ | |
|
| 285 |
+
|
| 286 |
+
### 6.2 CPU 路径预算
|
| 287 |
+
|
| 288 |
+
| 项 | 数值 | 说明 |
|
| 289 |
+
| --- | --- | --- |
|
| 290 |
+
| PyMuPDF 吞吐 | ~30 页/s/core | 现代数字 PDF,单核 |
|
| 291 |
+
| 所需 core·秒 | 3.0 × 10⁸ | 9e9 ÷ 30 |
|
| 292 |
+
| 所需 core·小时 | ≈ 8.3 × 10⁴ | |
|
| 293 |
+
| 32 节点 × 64 core | 2048 core 并行 | |
|
| 294 |
+
| CPU 路径墙钟 | ≈ 40 小时纯计算 | 不含 IO |
|
| 295 |
+
| 现实墙钟 | 1–2 周 | 含对象存储与 manifest 开销 |
|
| 296 |
+
|
| 297 |
+
### 6.3 GPU 路径预算
|
| 298 |
+
|
| 299 |
+
| 项 | 数值 | 说明 |
|
| 300 |
+
| --- | --- | --- |
|
| 301 |
+
| MinerU 2.5 吞吐 | ~2 页/s/A100 | 公开 2.12 fps |
|
| 302 |
+
| 所需 GPU·秒 | 5 × 10⁸ | 1e9 ÷ 2 |
|
| 303 |
+
| 所需 GPU·小时 | ≈ 1.39 × 10⁵ | |
|
| 304 |
+
| 50 × A100 满载 | ~115 天 | 理想吞吐 |
|
| 305 |
+
| 100 × A100 满载 | ~58 天 | 推荐配置 |
|
| 306 |
+
| 100 × H200 满载 | ~26 天 | 高端配置 |
|
| 307 |
+
|
| 308 |
+
**结论**:100 A100 + 32 节点 CPU 规模下,1 PB 原始 PDF 全量处理墙钟约 2 个月;100 H200 可压到 1 个月以内。与 FinePDFs 团队公开的处理周期数量级一致。
|
| 309 |
+
|
| 310 |
+
### 6.4 存储预算
|
| 311 |
+
|
| 312 |
+
- L0 原始:1 PB(不可压缩)
|
| 313 |
+
- L1 中间:~30 TB(每 PDF ~60 KB Markdown + meta)
|
| 314 |
+
- L2 输出 Parquet:~15 TB(zstd 压缩)
|
| 315 |
+
- Manifest + 索引:~50 GB
|
| 316 |
+
|
| 317 |
+
---
|
| 318 |
+
|
| 319 |
+
## 7. 存储与数据布局
|
| 320 |
+
|
| 321 |
+
```text
|
| 322 |
+
s3://pdfsystem-mnbvc/
|
| 323 |
+
├── L0_raw/ # 原始 PDF,不可变
|
| 324 |
+
│ └── ab/cd/abcd1234....pdf # 按 sha256 前 4 位分目录
|
| 325 |
+
├── L1_intermediate/
|
| 326 |
+
│ ├── stage1_triage/ # XGBoost 路由结果
|
| 327 |
+
│ │ └── shard-00001.parquet
|
| 328 |
+
│ ├── stage2a_text/ # CPU 路径产物
|
| 329 |
+
│ ├── stage2b_vision/ # GPU 路径产物
|
| 330 |
+
│ ├── stage3_postproc/
|
| 331 |
+
│ └── stage4_quality/
|
| 332 |
+
├── L2_output/
|
| 333 |
+
│ └── v1/lang=zh/source=arxiv/qb=high/shard-00001.parquet
|
| 334 |
+
├── manifest/
|
| 335 |
+
│ ├── ingest.parquet # Stage 0 写入
|
| 336 |
+
│ └── stage_status.parquet # 全 stage 状态
|
| 337 |
+
└── audit/
|
| 338 |
+
├── pii_hash_table.parquet # 不可逆审计
|
| 339 |
+
└── md_samples/ # 0.1% 抽样 Markdown
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
---
|
| 343 |
+
|
| 344 |
+
## 8. 可观测、容错、质量保证
|
| 345 |
+
|
| 346 |
+
### 8.1 指标
|
| 347 |
+
|
| 348 |
+
- 吞吐:每 stage 的 docs/s、pages/s、bytes/s(shard × worker 维度)
|
| 349 |
+
- 路由分布:`TEXT_OK / NEEDS_OCR / REJECT` 三类比例随时间变化
|
| 350 |
+
- 回退率:Stage 2A → 2B 回退占比(拐点立即告警)
|
| 351 |
+
- GPU 利用率:SM busy %、显存占用、batch 平均长度
|
| 352 |
+
- 失败率:按 stage 与失败原因分类计数
|
| 353 |
+
- 成本:估算的 GPU·小时 / TB 原始数据
|
| 354 |
+
|
| 355 |
+
### 8.2 质量回归
|
| 356 |
+
|
| 357 |
+
维护一份 **500 份手工对齐的中文 PDF 基准集**(学术 / 报告 / 扫描古籍 / 报纸 / 双栏论文各 100 份),每天对当天 pipeline 输出做一次自动 diff,超阈值触发人工复核。这是防止"悄悄变烂"的最重要防线。
|
| 358 |
+
|
| 359 |
+
### 8.3 故障策略
|
| 360 |
+
|
| 361 |
+
| 粒度 | 策略 |
|
| 362 |
+
| --- | --- |
|
| 363 |
+
| worker | 单页失败 retry 3 次,仍失败写 quarantine |
|
| 364 |
+
| 节点 | 心跳超时由调度器自动重排 |
|
| 365 |
+
| shard | manifest 标记 failed,人工审视后决定重跑或丢弃 |
|
| 366 |
+
| 全局 | 每周回看 quarantine 桶,按错误聚类决定是否升级 triage 分类器 |
|
| 367 |
+
|
| 368 |
+
---
|
| 369 |
+
|
| 370 |
+
## 9. 参考方案对比
|
| 371 |
+
|
| 372 |
+
| 方案 | 类型 | 中文 | 吞吐 | License | 本系统位置 |
|
| 373 |
+
| --- | --- | --- | --- | --- | --- |
|
| 374 |
+
| PyMuPDF / MuPDF | CPU 文本 | 好 | 10–30 PDF/s/core | AGPL | Stage 2A 主力 |
|
| 375 |
+
| Docling Heron int8 | CPU 布局 | 中 | 依赖 OpenVINO | MIT | 可选 |
|
| 376 |
+
| PP-DocLayoutV3 | 布局检测 | 好 | CPU/GPU 均可 | Apache 2.0 | Stage 2B 布局头 |
|
| 377 |
+
| **MinerU 2.5-Pro 1.2B** | VLM 端到端 | **极佳** | 2.12 fps@A100 | Apache 2.0 | **Stage 2B 主选** |
|
| 378 |
+
| PaddleOCR-VL 0.9B | VLM 端到端 | 好 | 比 MinerU 快 15.8% | Apache 2.0 | Stage 2B 备选 |
|
| 379 |
+
| RolmOCR | OCR | 中 | 需 vLLM / LMDeploy | Apache 2.0 | FinePDFs 原选,不用 |
|
| 380 |
+
| olmOCR | OCR | 中 | — | Apache 2.0 | baseline |
|
| 381 |
+
| Gemini 2.5 Pro | 闭源 API | 好 | API 限速 | 商业 | 不采用(成本 / 合规) |
|
| 382 |
+
|
| 383 |
+
---
|
| 384 |
+
|
| 385 |
+
## 10. 实施路线图
|
| 386 |
+
|
| 387 |
+
### P0 · 单机 PoC(2 周)
|
| 388 |
+
|
| 389 |
+
- 1 万份多样化 PDF 走通端到端 6 个 stage
|
| 390 |
+
- MinerU 2.5 + PyMuPDF 双路径独立验证
|
| 391 |
+
- 产出第一批 Parquet shard,人工对比抽样
|
| 392 |
+
- **交付物**:可运行的 docker compose + Jupyter 验证 notebook
|
| 393 |
+
|
| 394 |
+
### P1 · 分类器与 datatrove 集成(4 周)
|
| 395 |
+
|
| 396 |
+
- 标注 5–10 万份 PDF 训练 XGBoost triage 分类器
|
| 397 |
+
- 接入 `datatrove`,跑通单 shard 的 6-stage Slurm 作业
|
| 398 |
+
- 实现 manifest checkpoint 与回退闭环
|
| 399 |
+
- **交付物**:100 万 PDF 试运行报告 + 成本估算
|
| 400 |
+
|
| 401 |
+
### P2 · 10 GPU 集群试点(6 周)
|
| 402 |
+
|
| 403 |
+
- 1 TB 原始 PDF 全链路压测
|
| 404 |
+
- 调通反压、重跑、quarantine 流程
|
| 405 |
+
- 中文质量回归基准集上线
|
| 406 |
+
- **交付物**:可对外发布的 v0.1 数据集 + 评测报告
|
| 407 |
+
|
| 408 |
+
### P3 · PB 全量与持续迭代(持续)
|
| 409 |
+
|
| 410 |
+
- 100 GPU 满载推进到 PB 量级
|
| 411 |
+
- 按月发布 v0.2、v0.3,每版带 CHANGELOG 与质量 diff
|
| 412 |
+
- 长尾专项:竖排古籍、繁体、表格密集型行业报告
|
| 413 |
+
- 社区贡献:开源 triage 分类器与中文 EduScore
|
| 414 |
+
|
| 415 |
+
---
|
| 416 |
+
|
| 417 |
+
## 11. 风险与开放问题
|
| 418 |
+
|
| 419 |
+
- **中文古籍与竖排**:MinerU 2.5 在横排中文表现极好,但竖排古籍未经专门评测。P3 阶段计划微调一个古籍专用 LoRA。
|
| 420 |
+
- **公式与表格忠实度**:LaTeX 还原失败的样本比例需要持续监控;P2 质量回归基准集要专门覆盖公式表格密集的论文。
|
| 421 |
+
- **数据来源合规**:PB 级语料的来源 license 必须随数据流转保留,输出 Parquet 独立列存储。
|
| 422 |
+
- **MinerU 2.5 商用授权**:Apache 2.0,但需关注 OpenDataLab 后续版本条款,保留 PaddleOCR-VL 作为热备。
|
| 423 |
+
- **PII 召回率**:中文 PII 模式比英文复杂(地址、姓名),正则不够,可能需要小模型 NER 兜底。
|
| 424 |
+
- **对象存储成本**:PB 级数据 + 中间层 + 输出层每月存储与流量费用需在 P0 阶段完成 TCO 估算。
|
| 425 |
+
|
| 426 |
+
---
|
| 427 |
+
|
| 428 |
+
## 附录 A · 参考资料
|
| 429 |
+
|
| 430 |
+
- [FinePDFs 数据集](https://huggingface.co/datasets/HuggingFaceFW/finepdfs)
|
| 431 |
+
- [FinePDFs 博客](https://huggingface.co/spaces/HuggingFaceFW/FinePDFsBlog)
|
| 432 |
+
- [FinePDFs 代码库](https://github.com/huggingface/finepdfs)
|
| 433 |
+
- [MinerU 2.5 Pro 1.2B](https://modelscope.cn/models/OpenDataLab/MinerU2.5-Pro-2604-1.2B)
|
| 434 |
+
- [MinerU 2.5 论文 (arXiv:2509.22186)](https://arxiv.org/abs/2509.22186)
|
| 435 |
+
- [PaddleOCR-VL 论文 (arXiv:2510.14528)](https://arxiv.org/abs/2510.14528)
|
| 436 |
+
- [PP-DocLayoutV3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3)
|
| 437 |
+
- [MuPDF 1.27 文档](https://mupdf.readthedocs.io/en/1.27.2/)
|
| 438 |
+
- [datatrove](https://github.com/huggingface/datatrove)
|
| 439 |
+
- [MinerU 项目主页](https://github.com/opendatalab/MinerU)
|
| 440 |
+
- [OmniDocBench](https://github.com/opendatalab/OmniDocBench)
|
packages/pdfsys-bench/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-bench"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Cross-backend benchmarking — throughput, latency, and F1 on a small sample set."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_bench"]
|
packages/pdfsys-bench/src/pdfsys_bench/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-bench — evaluation harness.
|
| 2 |
+
|
| 3 |
+
Runs the same sample PDF set through mupdf / pipeline / vlm backends and
|
| 4 |
+
reports throughput, latency, and F1 against gold Markdown references.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__version__ = "0.0.1"
|
packages/pdfsys-core/pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-core"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Shared data contracts (PdfRecord, LayoutDocument, Manifest) for pdfsys."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = []
|
| 11 |
+
|
| 12 |
+
[tool.hatch.build.targets.wheel]
|
| 13 |
+
packages = ["src/pdfsys_core"]
|
packages/pdfsys-core/src/pdfsys_core/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-core — shared data contracts for the pdfsys pipeline.
|
| 2 |
+
|
| 3 |
+
This package holds only pure-Python dataclasses, enums, configuration, and
|
| 4 |
+
manifest/cache IO. It MUST NOT depend on any PDF, OCR, or ML library.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__version__ = "0.0.1"
|
packages/pdfsys-core/src/pdfsys_core/cache.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LayoutCache — on-disk cache keyed by (sha256, layout_model_version).
|
| 2 |
+
|
| 3 |
+
Ensures layout-analyser runs at most once per PDF; both router (for complex-
|
| 4 |
+
content decisions) and parser-pipeline / parser-vlm read from this cache.
|
| 5 |
+
|
| 6 |
+
Stub only — real implementation lands later.
|
| 7 |
+
"""
|
packages/pdfsys-core/src/pdfsys_core/config.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Runtime configuration dataclasses shared across every pdfsys stage.
|
| 2 |
+
|
| 3 |
+
Stub only — real dataclasses land later.
|
| 4 |
+
"""
|
packages/pdfsys-core/src/pdfsys_core/layout.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LayoutDocument schema — the contract every parser backend consumes.
|
| 2 |
+
|
| 3 |
+
Stubs only — LayoutPage / LayoutRegion / LayoutDocument dataclasses land later.
|
| 4 |
+
"""
|
packages/pdfsys-core/src/pdfsys_core/manifest.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""JSONL-backed manifest for tracking per-PDF state across pipeline stages.
|
| 2 |
+
|
| 3 |
+
Stub only — real implementation lands later.
|
| 4 |
+
"""
|
packages/pdfsys-core/src/pdfsys_core/types.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core record types shared by every pdfsys package.
|
| 2 |
+
|
| 3 |
+
Stubs only — real dataclasses land in a follow-up commit.
|
| 4 |
+
"""
|
packages/pdfsys-layout-analyser/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-layout-analyser"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Layout model runner: emits LayoutDocument to cache, runs once per PDF on the needs-ocr branch."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_layout_analyser"]
|
packages/pdfsys-layout-analyser/src/pdfsys_layout_analyser/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-layout-analyser — page layout detection.
|
| 2 |
+
|
| 3 |
+
Runs exactly once per PDF on the needs-ocr branch; the produced
|
| 4 |
+
LayoutDocument is cached and read by both pdfsys-router (stage-B decision)
|
| 5 |
+
and the downstream parser backends (pipeline / vlm).
|
| 6 |
+
|
| 7 |
+
Model candidates: PP-DocLayoutV3, docling-layout-heron (OpenVINO INT8).
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
__version__ = "0.0.1"
|
packages/pdfsys-layout-analyser/src/pdfsys_layout_analyser/analyser.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Layout analyser entrypoint.
|
| 2 |
+
|
| 3 |
+
Stub only — concrete model loading lands later.
|
| 4 |
+
"""
|
packages/pdfsys-parser-mupdf/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-parser-mupdf"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Text-ok backend: PyMuPDF extraction + reading order + Markdown emission."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_parser_mupdf"]
|
packages/pdfsys-parser-mupdf/src/pdfsys_parser_mupdf/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-parser-mupdf — text-ok extraction backend.
|
| 2 |
+
|
| 3 |
+
Consumes PDFs classified as text-ok by pdfsys-router. Uses PyMuPDF for
|
| 4 |
+
block extraction, applies a simple two-column reading order, and emits Markdown.
|
| 5 |
+
Does NOT depend on pdfsys-layout-analyser.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.0.1"
|
packages/pdfsys-parser-mupdf/src/pdfsys_parser_mupdf/extract.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""PyMuPDF extraction entrypoint. Stub only."""
|
packages/pdfsys-parser-pipeline/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-parser-pipeline"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "OCR pipeline backend: consumes cached LayoutDocument + region-level OCR engine (RapidOCR / PaddleOCR-classic)."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_parser_pipeline"]
|
packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-parser-pipeline — OCR-pipeline backend.
|
| 2 |
+
|
| 3 |
+
Handles the "needs-ocr AND no complex content" branch. Reads the cached
|
| 4 |
+
LayoutDocument produced by pdfsys-layout-analyser, renders each region via
|
| 5 |
+
PyMuPDF, runs line-level OCR (RapidOCR / PaddleOCR-classic, selectable via
|
| 6 |
+
config), and assembles the Markdown output. CPU-friendly.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
__version__ = "0.0.1"
|
packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/extract.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Region-level OCR assembly entrypoint. Stub only."""
|
packages/pdfsys-parser-pipeline/src/pdfsys_parser_pipeline/ocr_engine.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pluggable line-level OCR engine wrapper (RapidOCR / PaddleOCR-classic).
|
| 2 |
+
|
| 3 |
+
Stub only — if a future parser backend (e.g. vlm fallback) needs to share
|
| 4 |
+
this, promote it to a standalone `pdfsys-ocr-engine` package.
|
| 5 |
+
"""
|
packages/pdfsys-parser-vlm/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-parser-vlm"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "VLM backend: complex-content pages handed to MinerU 2.5 / PaddleOCR-VL."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_parser_vlm"]
|
packages/pdfsys-parser-vlm/src/pdfsys_parser_vlm/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-parser-vlm — VLM extraction backend.
|
| 2 |
+
|
| 3 |
+
Handles the "needs-ocr AND complex content" branch (tables, formulas,
|
| 4 |
+
heavy mixed layouts). Reads the cached LayoutDocument and routes only
|
| 5 |
+
complex regions through a VLM (MinerU 2.5 / PaddleOCR-VL). GPU path.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.0.1"
|
packages/pdfsys-parser-vlm/src/pdfsys_parser_vlm/extract.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""VLM extraction entrypoint. Stub only."""
|
packages/pdfsys-router/pyproject.toml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "pdfsys-router"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Two-stage router: stage A decides text-ok vs needs-ocr; stage B consults LayoutCache for complex-content (pipeline vs vlm) routing."
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"pdfsys-core",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[tool.uv.sources]
|
| 15 |
+
pdfsys-core = { workspace = true }
|
| 16 |
+
|
| 17 |
+
[tool.hatch.build.targets.wheel]
|
| 18 |
+
packages = ["src/pdfsys_router"]
|
packages/pdfsys-router/src/pdfsys_router/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""pdfsys-router — two-stage routing for the pdfsys extraction pipeline.
|
| 2 |
+
|
| 3 |
+
Stage A (cheap): classify text-ok vs needs-ocr from PyMuPDF features.
|
| 4 |
+
Stage B (uses layout cache): for needs-ocr, read the LayoutDocument written
|
| 5 |
+
by pdfsys-layout-analyser and decide pipeline vs vlm based on whether
|
| 6 |
+
complex regions (tables / formulas) exist.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
__version__ = "0.0.1"
|
packages/pdfsys-router/src/pdfsys_router/classifier.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Stage-A classifier: text-ok vs needs-ocr.
|
| 2 |
+
|
| 3 |
+
Stub only.
|
| 4 |
+
"""
|
packages/pdfsys-router/src/pdfsys_router/decider.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Stage-B decider: pipeline vs vlm, driven by LayoutDocument.has_complex_content.
|
| 2 |
+
|
| 3 |
+
Stub only.
|
| 4 |
+
"""
|
pyproject.toml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "pdfsys-mnbvc"
|
| 3 |
+
version = "0.0.1"
|
| 4 |
+
description = "PB-scale PDF -> pretraining-data pipeline (MNBVC edition). Workspace root."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
license = { text = "Apache-2.0" }
|
| 8 |
+
authors = [{ name = "MNBVC Contributors" }]
|
| 9 |
+
|
| 10 |
+
# Root depends on every workspace package so that `uv sync` at the repo
|
| 11 |
+
# root installs the entire pipeline in one shot.
|
| 12 |
+
dependencies = [
|
| 13 |
+
"pdfsys-core",
|
| 14 |
+
"pdfsys-router",
|
| 15 |
+
"pdfsys-layout-analyser",
|
| 16 |
+
"pdfsys-parser-mupdf",
|
| 17 |
+
"pdfsys-parser-pipeline",
|
| 18 |
+
"pdfsys-parser-vlm",
|
| 19 |
+
"pdfsys-bench",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[tool.uv.workspace]
|
| 23 |
+
members = ["packages/*"]
|
| 24 |
+
|
| 25 |
+
[tool.uv.sources]
|
| 26 |
+
pdfsys-core = { workspace = true }
|
| 27 |
+
pdfsys-router = { workspace = true }
|
| 28 |
+
pdfsys-layout-analyser = { workspace = true }
|
| 29 |
+
pdfsys-parser-mupdf = { workspace = true }
|
| 30 |
+
pdfsys-parser-pipeline = { workspace = true }
|
| 31 |
+
pdfsys-parser-vlm = { workspace = true }
|
| 32 |
+
pdfsys-bench = { workspace = true }
|
| 33 |
+
|
| 34 |
+
[build-system]
|
| 35 |
+
requires = ["hatchling"]
|
| 36 |
+
build-backend = "hatchling.build"
|
| 37 |
+
|
| 38 |
+
[tool.hatch.build.targets.wheel]
|
| 39 |
+
# Root is a virtual meta-package — no source of its own.
|
| 40 |
+
bypass-selection = true
|