ZaynZhu commited on
Commit
b447602
·
1 Parent(s): 44efbff
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Paper2Video/LICENSE +0 -21
  2. Paper2Video/README-CN.md +0 -248
  3. Paper2Video/README.md +0 -251
  4. Paper2Video/__init__.py +0 -0
  5. Paper2Video/src/__init__.py +0 -0
  6. Paper2Video/src/evaluation/IPMemory/construct.py +0 -69
  7. Paper2Video/src/evaluation/IPMemory/ip_qa.py +0 -142
  8. Paper2Video/src/evaluation/MetaSim_audio.py +0 -102
  9. Paper2Video/src/evaluation/MetaSim_content.py +0 -144
  10. Paper2Video/src/evaluation/PresentArena.py +0 -106
  11. Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py +0 -264
  12. Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py +0 -47
  13. Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py +0 -0
  14. Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py +0 -0
  15. Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py +0 -63
  16. Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py +0 -430
  17. Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py +0 -227
  18. Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py +0 -250
  19. Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py +0 -442
  20. Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py +0 -0
  21. Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py +0 -58
  22. Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py +0 -428
  23. Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py +0 -386
  24. Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py +0 -424
  25. Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py +0 -582
  26. Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py +0 -76
  27. Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py +0 -260
  28. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py +0 -0
  29. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py +0 -592
  30. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py +0 -1888
  31. Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py +0 -12
  32. Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py +0 -0
  33. Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py +0 -456
  34. Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py +0 -107
  35. Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py +0 -17
  36. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py +0 -0
  37. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py +0 -258
  38. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py +0 -394
  39. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py +0 -296
  40. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py +0 -67
  41. Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py +0 -348
  42. Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py +0 -6
  43. Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py +0 -0
  44. Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py +0 -87
  45. Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py +0 -189
  46. Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py +0 -251
  47. Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py +0 -190
  48. Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py +0 -386
  49. Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py +0 -177
  50. Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py +0 -197
Paper2Video/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Show Lab
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/README-CN.md DELETED
@@ -1,248 +0,0 @@
1
- # Paper2Video
2
-
3
- <p align="right">
4
- <a href="./README.md">English</a> | <b>简体中文</b>
5
- </p>
6
-
7
-
8
- <p align="center">
9
- <b>Paper2Video: 从学术论文自动生成演讲视频</b>
10
- <br>
11
-
12
-
13
- <p align="center">
14
- <a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
15
- <a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
16
- <a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
17
- 新加坡国立大学 Show Lab
18
- </p>
19
-
20
-
21
- <p align="center">
22
-   <a href="https://arxiv.org/abs/2510.05096">📄 论文</a> &nbsp; | &nbsp;
23
- <a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> &nbsp; | &nbsp;
24
-   <a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 数据集</a> &nbsp; | &nbsp;
25
-   <a href="https://showlab.github.io/Paper2Video/">🌐 项目主页</a> &nbsp; | &nbsp;
26
-   <a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 推特</a>
27
- </p>
28
-
29
- - **输入:** 一篇论文 ➕ 一张图像 ➕ 一段音频
30
-
31
- | 论文 | 图像 | 音频 |
32
- |--------|--------|--------|
33
- | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 论文链接](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton的图像| <img src="assets/sound.png" width="180"/><br>[🔗 音频样本](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
34
-
35
-
36
- - **输出:** 演讲视频
37
-
38
-
39
-
40
- https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
41
-
42
-
43
-
44
-
45
- 查看更多生成结果 [🌐 project page](https://showlab.github.io/Paper2Video/).
46
-
47
- ## 🔥 Update
48
- - [x] [2025.10.11] 我们的工作在[YC Hacker News](https://news.ycombinator.com/item?id=45553701)上受到关注.
49
- - [x] [2025.10.9] 感谢AK在[Twitter](https://x.com/_akhaliq/status/1976099830004072849)上分享我们的工作!
50
- - [x] [2025.10.9] 我们的工作被 [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27)报道.
51
- - [x] [2025.10.8] 下方查看我们的demo视频!
52
- - [x] [2025.10.7] 我们发布了 [Arxiv 论文](https://arxiv.org/abs/2510.05096).
53
- - [x] [2025.10.6] 我们发布了 [代码](https://github.com/showlab/Paper2Video) and [数据集](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
54
- - [x] [2025.9.28] Paper2Video 已经被 **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025** 接受.
55
-
56
-
57
- https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
58
-
59
-
60
-
61
-
62
- ---
63
-
64
- ### Table of Contents
65
- - [🌟 项目总览](#-项目总览)
66
- - [🚀 快速上手: PaperTalker](#-快速上手-PaperTalker)
67
- - [1. 环境配置](#1-环境配置)
68
- - [2. 大语言模型配置](#2-大语言模型配置)
69
- - [3. 推理](#3-推理)
70
- - [📊 评价指标: Paper2Video](#-评价指标-Paper2Video)
71
- - [😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频](#-乐趣-Paper2Video生成Paper2Video演讲视频)
72
- - [🙏 致谢](#-致谢)
73
- - [📌 引用](#-引用)
74
- ---
75
-
76
- ## 🌟 项目总览
77
- <p align="center">
78
- <img src="assets/teaser.png" alt="Overview" width="100%">
79
- </p>
80
-
81
- 这项工作解决了学术演讲的两个核心问题:
82
-
83
- - **左边: 如何根据论文制作学术演讲?**
84
- *PaperTalker* — 集成**幻灯片**、**字幕**、**光标**、**语音合成**和**演讲者视频渲染**的多智能体。
85
-
86
- - **右边: 如何评估学术演讲视频?**
87
- *Paper2Video* — 一个具有精心设计的指标来评估演示质量的基准。
88
-
89
-
90
- ---
91
-
92
- ## 🚀 尝试 PaperTalker 为你的论文制作演讲视频 !
93
- <p align="center">
94
- <img src="assets/method.png" alt="Approach" width="100%">
95
- </p>
96
-
97
- ### 1. 环境配置
98
- 准备Python环境:
99
- ```bash
100
- cd src
101
- conda create -n p2v python=3.10
102
- conda activate p2v
103
- pip install -r requirements.txt
104
- conda install -c conda-forge tectonic
105
- ````
106
- 下载所依赖代码,并按照[Hallo2](https://github.com/fudan-generative-vision/hallo2)中的说明下载模型权重。
107
- ```bash
108
- git clone https://github.com/fudan-generative-vision/hallo2.git
109
- ```
110
- 您需要**单独准备用于 talking-head generation 的环境**,以避免潜在的软件包冲突,请参考<a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>。安装完成后,使用 `which python` 命令获取 Python 环境路径。
111
- ```bash
112
- cd hallo2
113
- conda create -n hallo python=3.10
114
- conda activate hallo
115
- pip install -r requirements.txt
116
- ```
117
-
118
- ### 2. 大语言模型配置
119
- 在终端配置您的**API 凭证**:
120
- ```bash
121
- export GEMINI_API_KEY="your_gemini_key_here"
122
- export OPENAI_API_KEY="your_openai_key_here"
123
- ```
124
- 最佳实践是针对 LLM 和 VLM 使用 **GPT4.1** 或 **Gemini2.5-Pro**。我们也支持本地部署开源模型(例如 Qwen),详情请参阅 <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>。
125
-
126
- ### 3. 推理
127
- 脚本 `pipeline.py` 提供了一个自动化的学术演示视频生成流程。它以 **LaTeX 论文素材** 和 **参考图像/音频** 作为输入,并经过多个子模块(幻灯片 → 字幕 → 语音 → 光标 → 头部特写)生成完整的演示视频。⚡ 运行此流程的最低推荐 GPU 为 **NVIDIA A6000**,显存 48G。
128
-
129
- #### 示例用法
130
-
131
- 运行以下命令来启动完整生成:
132
-
133
- ```bash
134
- python pipeline.py \
135
- --model_name_t gpt-4.1 \
136
- --model_name_v gpt-4.1 \
137
- --model_name_talking hallo2 \
138
- --result_dir /path/to/output \
139
- --paper_latex_root /path/to/latex_proj \
140
- --ref_img /path/to/ref_img.png \
141
- --ref_audio /path/to/ref_audio.wav \
142
- --talking_head_env /path/to/hallo2_env \
143
- --gpu_list [0,1,2,3,4,5,6,7]
144
- ```
145
-
146
- | 参数名 | 类型 | 默认值 | 说明 |
147
- |----------|------|---------|-------------|
148
- | `--model_name_t` | `str` | `gpt-4.1` | 文本大语言模型(LLM) |
149
- | `--model_name_v` | `str` | `gpt-4.1` | 视觉语言模型(VLM) |
150
- | `--model_name_talking` | `str` | `hallo2` | Talking Head 模型。目前仅支持 **hallo2** |
151
- | `--result_dir` | `str` | `/path/to/output` | 输出目录(包括幻灯片、字幕、视频等) |
152
- | `--paper_latex_root` | `str` | `/path/to/latex_proj` | 论文 LaTeX 项目的根目录 |
153
- | `--ref_img` | `str` | `/path/to/ref_img.png` | 参考图像(必须为**正方形**人像) |
154
- | `--ref_audio` | `str` | `/path/to/ref_audio.wav` | 参考音频(建议时长约为 10 秒) |
155
- | `--ref_text` | `str` | `None` | 可选参考文本(用于字幕风格指导) |
156
- | `--beamer_templete_prompt` | `str` | `None` | 可选参考文本(用于幻灯片风格指导) |
157
- | `--gpu_list` | `list[int]` | `""` | GPU 列表,用于并行执行(适用于**光标生成**与 **Talking Head 渲染**) |
158
- | `--if_tree_search` | `bool` | `True` | 是否启用树搜索(用于幻灯片布局优化) |
159
- | `--stage` | `str` | `"[0]"` | 需要运行的阶段(例如 `[0]` 表示完整流程,`[1,2,3]` 表示部分阶段) |
160
- | `--talking_head_env` | `str` | `/path/to/hallo2_env` | Talking Head 生成的 Python 环境路径 |
161
- ---
162
-
163
- ## 📊 评价指标: Paper2Video
164
- <p align="center">
165
- <img src="assets/metrics.png" alt="Metrics" width="100%">
166
- </p>
167
-
168
- 与自然视频生成不同,学术演示视频发挥着高度专业化的作用:它们不仅关乎视觉保真度,更关乎**学术交流**。这使得直接应用视频合成中的传统指标(例如 FVD、IS 或基于 CLIP 的相似度)变得困难。相反,它们的价值在于它们如何有效地**传播研究成果**并**提升学术知名度**。从这个角度来看,我们认为,评判高质量的学术演示视频应该从两个互补的维度进行评判:
169
- #### 对于观众
170
- - 视频应**忠实传达论文的核心思想**。
171
- - 视频应**易于不同受众观看**。
172
-
173
- #### 对于作者
174
- - 视频应**突出作者的智力贡献和身份**。
175
- - 视频应**提升作品的知名度和影响力**。
176
-
177
- 为了实现这些目标,我们引入了专门为学术演示视频设计的评估指标:Meta Similarity, PresentArena, PresentQuiz, IP Memory.
178
-
179
- ### 运行评价
180
- - 准备环境:
181
- ```bash
182
- cd src/evaluation
183
- conda create -n p2v_e python=3.10
184
- conda activate p2v_e
185
- pip install -r requirements.txt
186
- ```
187
- - 对于 Meta Similarity 和 PresentArena:
188
- ```bash
189
- python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
190
- python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
191
- ```
192
- ```bash
193
- python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
194
- ```
195
- - 对于**PresentQuiz**,首先基于论文生成问题并使用 Gemini 进行评估:
196
- ```bash
197
- cd PresentQuiz
198
- python create_paper_questions.py --paper_folder /path/to/data
199
- python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
200
- ```
201
-
202
- - 对于**IP Memory**,首先从生成的视频中生成问题对,然后使用 Gemini 进行评估:
203
- ```bash
204
- cd IPMemory
205
- python construct.py
206
- python ip_qa.py
207
- ```
208
- 更多详情请查看代码!
209
-
210
- 👉 Paper2Video 数据集可在以下网址获取:
211
- [HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
212
-
213
- ---
214
-
215
- ## 😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频
216
- 查看 **Paper2Video 生成 Paper2Video 演讲视频**:
217
-
218
- https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
219
-
220
- ## 🙏 致谢
221
-
222
- * 数据集中演示视频的来源是 SlideLive 和 YouTube。
223
- * 感谢所有为制作演示视频付出辛勤努力的作者!
224
- * 感谢 [CAMEL](https://github.com/camel-ai/camel) 开源了组织良好的多智能体框架代码库。
225
- * 感谢 [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) 和 [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) 作者开源代码。
226
- * 感谢 [Wei Jia](https://github.com/weeadd) 在数据收集和baselines实现方面所做的努力。我们也感谢所有参与用户调研的参与者。
227
- * 感谢所有 **Show Lab @ NUS** 成员的支持!
228
-
229
-
230
-
231
- ---
232
-
233
- ## 📌 引用
234
-
235
-
236
- 如果我们的工作对您有帮助,欢迎引用我们的工作:
237
-
238
- ```bibtex
239
- @misc{paper2video,
240
- title={Paper2Video: Automatic Video Generation from Scientific Papers},
241
- author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
242
- year={2025},
243
- eprint={2510.05096},
244
- archivePrefix={arXiv},
245
- primaryClass={cs.CV},
246
- url={https://arxiv.org/abs/2510.05096},
247
- }
248
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/README.md DELETED
@@ -1,251 +0,0 @@
1
- # Paper2Video
2
-
3
- <p align="right">
4
- <b>English</b> | <a href="./README-CN.md">简体中文</a>
5
- </p>
6
-
7
-
8
- <p align="center">
9
- <b>Paper2Video: Automatic Video Generation from Scientific Papers</b>
10
- <br>
11
- 从学术论文自动生成演讲视频
12
- </p>
13
-
14
- <p align="center">
15
- <a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
16
- <a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
17
- <a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
18
- Show Lab, National University of Singapore
19
- </p>
20
-
21
-
22
- <p align="center">
23
-   <a href="https://arxiv.org/abs/2510.05096">📄 Paper</a> &nbsp; | &nbsp;
24
- <a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> &nbsp; | &nbsp;
25
-   <a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 Dataset</a> &nbsp; | &nbsp;
26
-   <a href="https://showlab.github.io/Paper2Video/">🌐 Project Website</a> &nbsp; | &nbsp;
27
-   <a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 X (Twitter)</a>
28
- </p>
29
-
30
- - **Input:** a paper ➕ an image ➕ an audio
31
-
32
- | Paper | Image | Audio |
33
- |--------|--------|--------|
34
- | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 Paper link](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton's photo| <img src="assets/sound.png" width="180"/><br>[🔗 Audio sample](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
35
-
36
-
37
- - **Output:** a presentation video
38
-
39
-
40
-
41
- https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
42
-
43
-
44
-
45
-
46
- Check out more examples at [🌐 project page](https://showlab.github.io/Paper2Video/).
47
-
48
- ## 🔥 Update
49
- - [x] [2025.10.11] Our work receives attention on [YC Hacker News](https://news.ycombinator.com/item?id=45553701).
50
- - [x] [2025.10.9] Thanks AK for sharing our work on [Twitter](https://x.com/_akhaliq/status/1976099830004072849)!
51
- - [x] [2025.10.9] Our work is reported by [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27).
52
- - [x] [2025.10.8] Check out our demo video below!
53
- - [x] [2025.10.7] We release the [arxiv paper](https://arxiv.org/abs/2510.05096).
54
- - [x] [2025.10.6] We release the [code](https://github.com/showlab/Paper2Video) and [dataset](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
55
- - [x] [2025.9.28] Paper2Video has been accepted to the **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025**.
56
-
57
-
58
- https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
59
-
60
-
61
-
62
-
63
- ---
64
-
65
- ### Table of Contents
66
- - [🌟 Overview](#-overview)
67
- - [🚀 Quick Start: PaperTalker](#-try-papertalker-for-your-paper-)
68
- - [1. Requirements](#1-requirements)
69
- - [2. Configure LLMs](#2-configure-llms)
70
- - [3. Inference](#3-inference)
71
- - [📊 Evaluation: Paper2Video](#-evaluation-paper2video)
72
- - [😼 Fun: Paper2Video for Paper2Video](#-fun-paper2video-for-paper2video)
73
- - [🙏 Acknowledgements](#-acknowledgements)
74
- - [📌 Citation](#-citation)
75
-
76
- ---
77
-
78
- ## 🌟 Overview
79
- <p align="center">
80
- <img src="assets/teaser.png" alt="Overview" width="100%">
81
- </p>
82
-
83
- This work solves two core problems for academic presentations:
84
-
85
- - **Left: How to create a presentation video from a paper?**
86
- *PaperTalker* — an agent that integrates **slides**, **subtitling**, **cursor grounding**, **speech synthesis**, and **talking-head video rendering**.
87
-
88
- - **Right: How to evaluate a presentation video?**
89
- *Paper2Video* — a benchmark with well-designed metrics to evaluate presentation quality.
90
-
91
-
92
- ---
93
-
94
- ## 🚀 Try PaperTalker for your Paper!
95
- <p align="center">
96
- <img src="assets/method.png" alt="Approach" width="100%">
97
- </p>
98
-
99
- ### 1. Requirements
100
- Prepare the environment:
101
- ```bash
102
- cd src
103
- conda create -n p2v python=3.10
104
- conda activate p2v
105
- pip install -r requirements.txt
106
- conda install -c conda-forge tectonic
107
- ````
108
- Download the dependent code and follow the instructions in **[Hallo2](https://github.com/fudan-generative-vision/hallo2)** to download the model weight.
109
- ```bash
110
- git clone https://github.com/fudan-generative-vision/hallo2.git
111
- ```
112
- You need to **prepare the environment separately for talking-head generation** to avoid potential package conflicts; please refer to <a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>. After installing, use `which python` to get the python environment path.
113
- ```bash
114
- cd hallo2
115
- conda create -n hallo python=3.10
116
- conda activate hallo
117
- pip install -r requirements.txt
118
- ```
119
-
120
- ### 2. Configure LLMs
121
- Export your **API credentials**:
122
- ```bash
123
- export GEMINI_API_KEY="your_gemini_key_here"
124
- export OPENAI_API_KEY="your_openai_key_here"
125
- ```
126
- The best practice is to use **GPT4.1** or **Gemini2.5-Pro** for both LLM and VLMs. We also support locally deployed open-source model(e.g., Qwen), details please referring to <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>.
127
-
128
- ### 3. Inference
129
- The script `pipeline.py` provides an automated pipeline for generating academic presentation videos. It takes **LaTeX paper sources** together with **reference image/audio** as input, and goes through multiple sub-modules (Slides → Subtitles → Speech → Cursor → Talking Head) to produce a complete presentation video. ⚡ The minimum recommended GPU for running this pipeline is **NVIDIA A6000** with 48G.
130
-
131
- #### Example Usage
132
-
133
- Run the following command to launch a full generation:
134
-
135
- ```bash
136
- python pipeline.py \
137
- --model_name_t gpt-4.1 \
138
- --model_name_v gpt-4.1 \
139
- --model_name_talking hallo2 \
140
- --result_dir /path/to/output \
141
- --paper_latex_root /path/to/latex_proj \
142
- --ref_img /path/to/ref_img.png \
143
- --ref_audio /path/to/ref_audio.wav \
144
- --talking_head_env /path/to/hallo2_env \
145
- --gpu_list [0,1,2,3,4,5,6,7]
146
- ```
147
-
148
- | Argument | Type | Default | Description |
149
- |----------|------|---------|-------------|
150
- | `--model_name_t` | `str` | `gpt-4.1` | LLM |
151
- | `--model_name_v` | `str` | `gpt-4.1` | VLM |
152
- | `--model_name_talking` | `str` | `hallo2` | Talking Head model. Currently only **hallo2** is supported |
153
- | `--result_dir` | `str` | `/path/to/output` | Output directory (slides, subtitles, videos, etc.) |
154
- | `--paper_latex_root` | `str` | `/path/to/latex_proj` | Root directory of the LaTeX paper project |
155
- | `--ref_img` | `str` | `/path/to/ref_img.png` | Reference image (must be **square** portrait) |
156
- | `--ref_audio` | `str` | `/path/to/ref_audio.wav` | Reference audio (recommended: ~10s) |
157
- | `--ref_text` | `str` | `None` | Optional reference text (for style guidance for subtitles) |
158
- | `--beamer_templete_prompt` | `str` | `None` | Optional reference text (for style guidance for slides) |
159
- | `--gpu_list` | `list[int]` | `""` | GPU list for parallel execution (used in **cursor generation** and **Talking Head rendering**) |
160
- | `--if_tree_search` | `bool` | `True` | Whether to enable tree search for slide layout refinement |
161
- | `--stage` | `str` | `"[0]"` | Pipeline stages to run (e.g., `[0]` full pipeline, `[1,2,3]` partial stages) |
162
- | `--talking_head_env` | `str` | `/path/to/hallo2_env` | python environment path for talking-head generation |
163
- ---
164
-
165
- ## 📊 Evaluation: Paper2Video
166
- <p align="center">
167
- <img src="assets/metrics.png" alt="Metrics" width="100%">
168
- </p>
169
-
170
- Unlike natural video generation, academic presentation videos serve a highly specialized role: they are not merely about visual fidelity but about **communicating scholarship**. This makes it difficult to directly apply conventional metrics from video synthesis(e.g., FVD, IS, or CLIP-based similarity). Instead, their value lies in how well they **disseminate research** and **amplify scholarly visibility**.From this perspective, we argue that a high-quality academic presentation video should be judged along two complementary dimensions:
171
- #### For the Audience
172
- - The video is expected to **faithfully convey the paper’s core ideas**.
173
- - It should remain **accessible to diverse audiences**.
174
-
175
- #### For the Author
176
- - The video should **foreground the authors’ intellectual contribution and identity**.
177
- - It should **enhance the work’s visibility and impact**.
178
-
179
- To capture these goals, we introduce evaluation metrics specifically designed for academic presentation videos: Meta Similarity, PresentArena, PresentQuiz, IP Memory.
180
-
181
- ### Run Eval
182
- - Prepare the environment:
183
- ```bash
184
- cd src/evaluation
185
- conda create -n p2v_e python=3.10
186
- conda activate p2v_e
187
- pip install -r requirements.txt
188
- ```
189
- - For MetaSimilarity and PresentArena:
190
- ```bash
191
- python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
192
- python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
193
- ```
194
- ```bash
195
- python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
196
- ```
197
- - For **PresentQuiz**, first generate questions from paper and eval using Gemini:
198
- ```bash
199
- cd PresentQuiz
200
- python create_paper_questions.py ----paper_folder /path/to/data
201
- python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
202
- ```
203
-
204
- - For **IP Memory**, first generate question pairs from generated videos and eval using Gemini:
205
- ```bash
206
- cd IPMemory
207
- python construct.py
208
- python ip_qa.py
209
- ```
210
- See the codes for more details!
211
-
212
- 👉 Paper2Video Benchmark is available at:
213
- [HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
214
-
215
- ---
216
-
217
- ## 😼 Fun: Paper2Video for Paper2Video
218
- Check out **How Paper2Video for Paper2Video**:
219
-
220
- https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
221
-
222
- ## 🙏 Acknowledgements
223
-
224
- * The sources of the presentation videos are SlideLive and YouTube.
225
- * We thank all the authors who spend a great effort to create presentation videos!
226
- * We thank [CAMEL](https://github.com/camel-ai/camel) for open-source well-organized multi-agent framework codebase.
227
- * We thank the authors of [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) and [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) for their open-sourced codes.
228
- * We thank [Wei Jia](https://github.com/weeadd) for his effort in collecting the data and implementing the baselines. We also thank all the participants involved in the human studies.
229
- * We thank all the **Show Lab @ NUS** members for support!
230
-
231
-
232
-
233
- ---
234
-
235
- ## 📌 Citation
236
-
237
-
238
- If you find our work useful, please cite:
239
-
240
- ```bibtex
241
- @misc{paper2video,
242
- title={Paper2Video: Automatic Video Generation from Scientific Papers},
243
- author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
244
- year={2025},
245
- eprint={2510.05096},
246
- archivePrefix={arXiv},
247
- primaryClass={cs.CV},
248
- url={https://arxiv.org/abs/2510.05096},
249
- }
250
- ```
251
- [![Star History](https://api.star-history.com/svg?repos=showlab/Paper2Video&type=Date)](https://star-history.com/#showlab/Paper2Video&Date)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/__init__.py DELETED
File without changes
Paper2Video/src/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/IPMemory/construct.py DELETED
@@ -1,69 +0,0 @@
1
- """
2
- construct question about Academic IP
3
- input query: 4 video clips from 4 different paper presentation + query (image/audio)
4
- input question: 4 understanding qa from corresponding paper
5
- output task: choose the right question to ask
6
- """
7
- import os, re
8
- import json
9
- import random
10
- import itertools
11
- from os import path
12
- from typing import List
13
- from pathlib import Path
14
- from tqdm import tqdm
15
-
16
- def generate_combinations(total_num, comb_size):
17
- return list(itertools.combinations(range(total_num), comb_size))
18
-
19
- def generate_ip_task(vaild_data_name, num_qa_pair):
20
- combs = list(itertools.combinations(range(len(vaild_data_name)), 4))
21
- combs = random.sample(combs, num_qa_pair)
22
-
23
- qa_list = []
24
- for comb in combs:
25
- ## questions
26
- question_list = []
27
- question_index = random.randint(1, 50)
28
- for index in comb:
29
- question_path = path.join(vaild_data_name[index][1], "4o-mini_qa.json")
30
- with open(question_path, 'r') as f: question = json.load(f)["understanding"]["questions"]
31
- question_list.append(question["Question {}".format(str(question_index))]["question"])
32
- ## query
33
- query_list = []
34
- for index in comb:
35
- ref_img_path = path.join(vaild_data_name[index][1], "ref_img.png")
36
- ref_audio_path = path.join(vaild_data_name[index][1], "ref_audio.wav")
37
- query_list.append((ref_img_path, ref_audio_path))
38
- ## qa
39
- qa = {}
40
- qa["videos"] = []
41
- for idx in range(len(comb)):
42
- qa["videos"].append(vaild_data_name[comb[idx]][0])
43
-
44
- qa["querys"] = query_list
45
- qa["questions"] = question_list
46
- qa_list.append(qa)
47
- with open("ip_qa.json", 'w') as f: json.dump(qa_list, f, indent=4)
48
-
49
- _num_at_start = re.compile(r'^\s*["\']?(\d+)')
50
- def sort_by_leading_number(paths: List[str]) -> List[str]:
51
- def key(p: str):
52
- name = Path(p).name
53
- m = _num_at_start.match(name)
54
- return (int(m.group(1)) if m else float('inf'), name)
55
- return sorted(paths, key=key)
56
-
57
- if __name__ == "__main__":
58
- num_qa_pair = 10 # C (num_data) (4)
59
- root_dir = "/path/to/result"
60
- gt_dir = "/path/to/data"
61
-
62
- all_data_name = sort_by_leading_number(os.listdir(root_dir))
63
- all_groundtruth = sort_by_leading_number(os.listdir(gt_dir))
64
- vaild_data_name = []
65
- for data_idx in range(len(all_data_name)):
66
- if path.basename(root_dir) == "paper2video":
67
- video_result_1 = path.join(root_dir, all_data_name[data_idx], "3_merage.mp4")
68
- video_result_2 = path.join(root_dir.replace("paper2video", "presentagent"), all_data_name[data_idx], "result.mp4")
69
- generate_ip_task(vaild_data_name, num_qa_pair)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/IPMemory/ip_qa.py DELETED
@@ -1,142 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import time
5
- import random
6
- import argparse, pdb
7
- from os import path
8
- import google.generativeai as genai
9
- from moviepy.editor import VideoFileClip
10
- from camel.models import ModelFactory
11
- from camel.types import ModelType, ModelPlatformType
12
- from camel.configs import GeminiConfig
13
- from typing import List
14
- from pathlib import Path
15
-
16
-
17
- genai.configure(api_key="")
18
-
19
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Order paths by the leading integer in their basename; non-numeric last."""
    def numeric_key(entry: str):
        base = Path(entry).name
        hit = _num_at_start.match(base)
        rank = int(hit.group(1)) if hit else float('inf')
        return (rank, base)

    return sorted(paths, key=numeric_key)
26
# Root of the human-made benchmark data; one sub-folder per paper.
dataset_path = "/path/to/data"
# Folder names sorted by their numeric prefix so vid_id indexing is stable.
dataset_list = sort_by_leading_number(os.listdir(dataset_path))
29
-
30
def eval_ip(root_path, clip_duration, model_list, prompt_path, question_path, test_type='image'):
    """Run the identity-preservation QA benchmark.

    For each QA entry: sample one short clip per (video id, model), upload the
    clips plus a query (reference face image or voice sample) to Gemini, and
    check whether the judge matches the query to the correct video.

    Parameters
    ----------
    root_path : str       root folder containing each system's generated videos
    clip_duration : float seconds of video sampled from each presentation
    model_list : list     system identifiers ('p2v', 'veo3', ...)
    prompt_path : str     judge prompt template file
    question_path : str   QA JSON produced by IPMemory/construct.py
    test_type : str       'image' (face query) or 'audio' (voice query)

    Fixes vs. previous revision:
    * prompt read verbatim — '"/n".join(readlines())' injected literal "/n"
    * the audio branch was spelled 'aduio' and never uploaded its query,
      leaving query_state unbound (NameError); both spellings now work
    * clip start sampling no longer goes negative for short videos
    * an unparseable judge reply records an incorrect answer instead of
      silently reusing the previous iteration's choice_num
    """
    tmp_dir = "tmp"
    os.makedirs(tmp_dir, exist_ok=True)
    gemini_model = genai.GenerativeModel("models/gemini-2.5-pro-flash")

    with open(prompt_path, 'r') as f:
        prompt = f.read()
    with open(question_path, 'r') as f:
        questions = json.load(f)

    result_each_question = []
    for question in questions:
        video_ids = question["videos"]
        querys = question["querys"]
        qs = question["questions"]

        ## sample one short clip per (video id, model)
        video_clips_path = {model: [] for model in model_list}

        start_p2v = None  # shared start so p2v / p2v-o clips are time-aligned
        for vid_id in video_ids:
            tmp_dir_id = path.join(tmp_dir, str(vid_id))
            os.makedirs(tmp_dir_id, exist_ok=True)
            for model in model_list:
                # each system lays out its outputs differently
                if model == 'p2v': video_path = path.join(root_path, "paper2video", str(vid_id), '3_merage.mp4')
                elif model == 'p2v-o': video_path = path.join(root_path, "paper2video_wo_presenter", str(vid_id), 'result.mp4')
                elif model == 'veo3': video_path = path.join(root_path, "veo3", str(vid_id)+".mp4")
                elif model == 'wan2.2': video_path = path.join(root_path, "wan2.2", str(int(vid_id)-1), "result.mp4")
                elif model == 'presentagent': video_path = path.join(root_path, "presentagent", str(vid_id), "result.mp4")
                elif model == 'human-made': video_path = path.join(dataset_path, dataset_list[int(vid_id)-1], "gt_presentation_video.mp4")

                video = VideoFileClip(video_path)
                # clamp so uniform() never gets a negative upper bound
                max_start = max(0, video.duration - clip_duration - 1)
                if model in ('p2v', 'p2v-o'):
                    if start_p2v is None:
                        start_p2v = random.uniform(0, max_start)
                    start = start_p2v
                else:
                    start = random.uniform(0, max_start)
                end = min(start + clip_duration, video.duration)

                clip_save_path = path.join(tmp_dir_id, model + ".mp4")
                subclip = video.subclip(start, end)
                subclip.write_videofile(clip_save_path, codec="libx264", audio_codec="aac")
                video_clips_path[model].append(clip_save_path)

        ## test for each model: one round per query (4 per entry)
        result_each_model = {}
        for model in model_list:
            videos = upload_videos(video_clips_path[model])
            result_each_model[model] = []
            for idx, query in enumerate(querys):
                if test_type == 'image':
                    query = query[0]
                    query_state = genai.upload_file(path=query, mime_type="image/png")
                elif test_type in ('audio', 'aduio'):  # accept the legacy misspelling
                    query = query[1]
                    query_state = genai.upload_file(path=query, mime_type="audio/wav")
                else:
                    raise ValueError(f"unknown test_type: {test_type!r}")

                # shuffle answer positions so ordering gives nothing away
                answer = idx
                ori_idxs = [0, 1, 2, 3]
                shuffled_idx = ori_idxs.copy()
                random.shuffle(shuffled_idx)
                mapping = dict(zip(ori_idxs, shuffled_idx))
                new_answer = mapping[answer]
                new_qs = [qs[mapping[i]] for i in ori_idxs]

                contents = [prompt, "Here are the quary", genai.get_file(query_state.name), "Here are the video clips"]
                contents.extend(videos)
                contents.extend(["Here are the questions"])
                contents.extend(new_qs)

                response = gemini_model.generate_content(contents)
                match = re.search(r"My choice:\s*(\d+)", response.text)
                choice_num = int(match.group(1)) - 1 if match else None
                result_each_model[model].append(
                    [query, new_qs, choice_num, new_answer, choice_num == new_answer])
        result_each_question.append(result_each_model)
        print(result_each_question)
    with open("ip_qa_result.json", 'w') as f:
        json.dump(result_each_question, f, indent=4)
117
-
118
def upload_videos(video_list):
    """Upload each local mp4 to the Gemini file store and block until every
    upload reaches the ACTIVE state; return refreshed file handles."""
    uploaded = [genai.upload_file(path=p, mime_type="video/mp4") for p in video_list]

    # poll until server-side processing finishes for all files
    while True:
        all_active = True
        for handle in uploaded:
            if genai.get_file(handle.name).state.name != "ACTIVE":
                all_active = False
                time.sleep(5)
                print(f"waiting 5 seconds...")
                break
        if all_active:
            break

    return [genai.get_file(handle.name) for handle in uploaded]
135
-
136
if __name__ == "__main__":
    # Benchmark configuration; replace the placeholder paths before running.
    clip_duration = 4  # seconds sampled from each presentation video
    prompt_path = "./prompt/ip_qa.txt"  # judge instructions for the IP quiz
    # Systems under comparison; names must match eval_ip's path layout cases.
    model_list = ["p2v", "p2v-o", "veo3", "wan2.2", "presentagent", "human-made"]
    root_path = "/path/to/result"
    question_path = "ip_qa.json"  # produced by IPMemory/construct.py
    eval_ip(root_path, clip_duration, model_list, prompt_path, question_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/MetaSim_audio.py DELETED
@@ -1,102 +0,0 @@
1
- import os, re, json
2
- import random
3
- import argparse
4
- import moviepy.editor as mp
5
- from os import path
6
- from pathlib import Path
7
- from typing import List
8
- from pyannote.audio import Audio
9
- from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
10
- from scipy.spatial.distance import cosine
11
-
12
-
13
def extract_random_audio_segment(video_path: str, output_wav_path: str, duration: float = 5.0):
    """Write a random `duration`-second 16 kHz PCM wav sampled from a video's
    audio track.

    Fixes vs. previous revision: when the track is shorter than `duration`,
    the subclip end is clamped to the track length (it previously ran past
    the end), and the video handle is closed to avoid leaking file handles.
    """
    print(video_path)
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        total_duration = audio.duration
        if duration >= total_duration:
            # track shorter than the requested window: take the whole track
            start_time, end_time = 0, total_duration
        else:
            start_time = random.uniform(0, total_duration - duration)
            end_time = start_time + duration

        audio_subclip = audio.subclip(start_time, end_time)
        audio_subclip.write_audiofile(output_wav_path, codec='pcm_s16le', fps=16000)
    finally:
        video.close()
24
-
25
def compute_speaker_similarity(audio_path_1: str, audio_path_2: str, device: str = "cuda") -> float:
    """Return the cosine similarity between ECAPA speaker embeddings of two
    16 kHz wav files (1.0 = same voice, lower = more dissimilar)."""
    embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=device)
    audio_loader = Audio(sample_rate=16000)

    embeddings = []
    for wav_path in (audio_path_1, audio_path_2):
        waveform, _ = audio_loader(wav_path)
        # keep the first channel only and add a batch dimension
        waveform = waveform[0:1].unsqueeze(0)
        embedding = embedding_model(waveform)
        embeddings.append(embedding.reshape(embedding.shape[1]))

    return 1 - cosine(embeddings[0], embeddings[1])
42
-
43
-
44
def get_audio_sim_score(gen_video_path, gt_video_path):
    """Sample a 5 s audio snippet from each video (written as a sibling .wav)
    and return the speaker similarity between the two snippets."""
    gen_wav_path = gen_video_path.replace('.mp4', '.wav')
    gt_wav_path = gt_video_path.replace('.mp4', '.wav')
    extract_random_audio_segment(gen_video_path, gen_wav_path, duration=5)
    extract_random_audio_segment(gt_video_path, gt_wav_path, duration=5)
    return compute_speaker_similarity(gen_wav_path, gt_wav_path)
50
-
51
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort by numeric basename prefix, non-numeric names last, ties by name."""
    def rank(p: str):
        base = Path(p).name
        found = _num_at_start.match(base)
        return (int(found.group(1)), base) if found else (float('inf'), base)

    return sorted(paths, key=rank)
58
-
59
- if __name__ == "__main__":
60
- parser = argparse.ArgumentParser()
61
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
62
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
63
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
64
- args = parser.parse_args()
65
-
66
- ## load exist result if have
67
- save_dir = args.save_dir
68
- save_dir = path.join(save_dir, path.basename(args.result_dir))
69
- save_path = path.join(save_dir, "audio_sim.json")
70
- os.makedirs(save_dir, exist_ok=True)
71
- if path.exists(save_path):
72
- with open(save_path, 'r') as f: audio_similarity_list = json.load(f)
73
- else: audio_similarity_list = []
74
-
75
- ## path
76
- gt_dir, result_dir = args.gt_dir, args.result_dir
77
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
78
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
79
-
80
- for index in range(len(audio_similarity_list), 40):
81
- if path.basename(args.result_dir) == "paper2video":
82
- p2v_video_path = path.join(result_list[index], "3_merage.mp4")
83
- elif path.basename(args.result_dir) == "veo3":
84
- p2v_video_path = path.join(result_list[index])
85
- else:
86
- p2v_video_path = path.join(result_list[index], "result.mp4")
87
- if path.exists(p2v_video_path) is False: continue
88
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
89
- if path.exists(gt_video_path) is False: continue
90
- print(p2v_video_path, gt_video_path)
91
- similarity = get_audio_sim_score(p2v_video_path, gt_video_path)
92
- audio_similarity_list.append({
93
- "data_idx": index,
94
- "score": similarity.item()
95
- })
96
- print(audio_similarity_list)
97
- with open(save_path, 'w') as f: json.dump(audio_similarity_list, f, indent=4)
98
-
99
- # import numpy as np
100
- # avg = np.average(similarity_all)
101
- # var = np.var(similarity_all)
102
- # print(avg, var)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/MetaSim_content.py DELETED
@@ -1,144 +0,0 @@
1
- import os, re, pdb, json
2
- from PIL import Image
3
- import pytesseract
4
-
5
- import whisperx
6
- import argparse
7
- import torch
8
- import numpy as np
9
- from os import path
10
- from pathlib import Path
11
- from typing import List
12
- from camel.models import ModelFactory
13
- from camel.types import ModelType, ModelPlatformType
14
- from camel.configs import GeminiConfig
15
-
16
-
17
- os.environ["GEMINI_API_KEY"] = ""
18
- prompt_path = "./prompt/content_sim_score.txt"
19
-
20
- agent_config = {
21
- "model_type": ModelType.GEMINI_2_5_FLASH,
22
- "model_config": GeminiConfig().as_dict(),
23
- "model_platform": ModelPlatformType.GEMINI,}
24
- actor_model = ModelFactory.create(
25
- model_platform=agent_config['model_platform'],
26
- model_type=agent_config['model_type'],
27
- model_config_dict=agent_config['model_config'],)
28
-
29
def extract_slide_texts(slide_dir):
    """OCR every slide image in `slide_dir` (sorted by filename) into a list
    of stripped text strings."""
    texts = []
    for fname in sorted(os.listdir(slide_dir)):
        if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        # named img_path so the module-level `os.path` alias isn't shadowed
        img_path = os.path.join(slide_dir, fname)
        texts.append(pytesseract.image_to_string(Image.open(img_path)).strip())
    return texts
37
-
38
def load_subtitles(sub_path):
    """Read a subtitle text file and return its non-empty lines, stripped."""
    with open(sub_path, "r") as f:
        return [stripped for stripped in (line.strip() for line in f) if stripped]
42
-
43
def build_prompt(slides_1, subs_1, slides_2, subs_2):
    """Assemble the judge prompt: human slides/subtitles followed by the
    generated presentation's slides/subtitles."""
    sections = [
        "Human Presentation:\n",
        "Slides:\n" + "\n".join(slides_1) + "\n",
        "Subtitles:\n" + "\n".join(subs_1) + "\n\n",
        "Generated Presentation:\n",
        "Slides:\n" + "\n".join(slides_2) + "\n",
        "Subtitles:\n" + "\n".join(subs_2) + "\n\n",
    ]
    return "".join(sections)
52
-
53
def run_similarity_eval(slide_dir_1, slide_dir_2, sub_path_1, sub_path_2):
    """Score content similarity between two presentations (slides + subtitles)
    with the module-level LLM judge and return its raw reply text.

    Fixes vs. previous revision: the rubric file is read verbatim with
    f.read() (readlines() + "\\n".join() doubled every newline), and the
    question block is appended with a real newline instead of the literal
    two-character string "/n".
    """
    slides_1 = extract_slide_texts(slide_dir_1)
    slides_2 = extract_slide_texts(slide_dir_2)
    subs_1 = load_subtitles(sub_path_1)
    subs_2 = load_subtitles(sub_path_2)

    with open(prompt_path, 'r') as f:
        prompt = f.read()
    prompt = prompt + '\n' + build_prompt(slides_1, subs_1, slides_2, subs_2)

    output = actor_model.run([{"role": "user", "content": prompt}])
    print("=== Similarity Evaluation ===\n")
    print(output.choices[0].message.content)
    return output.choices[0].message.content
68
-
69
def extract_plain_subtitle_with_whisperx(video_path: str, output_path: str, model_name: str = "large-v3", language: str = "en"):
    """Transcribe a video's audio with WhisperX and write one stripped
    segment per line to `output_path`."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisperx.load_model(model_name, device=device, language=language)

    result = model.transcribe(whisperx.load_audio(video_path), batch_size=16)

    lines = [segment["text"].strip() + "\n" for segment in result["segments"]]
    with open(output_path, "w") as f:
        f.writelines(lines)
79
-
80
def extract_similarity_scores(text):
    """Parse the judge's reply for a 'Content Similarity: N/5' score.

    Returns the integer score, or None when the pattern is absent — the old
    version fell through implicitly, which hid missing scores; the contract
    is now explicit and documented.
    """
    content_match = re.search(r"Content Similarity:\s*(\d+)/5", text)
    if content_match:
        return int(content_match.group(1))
    return None
85
-
86
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort by the integer prefix of each basename; names lacking one go last."""
    def ordering(p: str):
        base = Path(p).name
        m = _num_at_start.match(base)
        if m is None:
            return (float('inf'), base)
        return (int(m.group(1)), base)

    return sorted(paths, key=ordering)
93
-
94
- if __name__ == "__main__":
95
- parser = argparse.ArgumentParser()
96
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
97
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
98
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
99
- args = parser.parse_args()
100
-
101
- ## load exist result if have
102
- save_dir = args.save_dir
103
- save_dir = path.join(save_dir, path.basename(args.result_dir))
104
- save_path = path.join(save_dir, "content_sim.json")
105
- os.makedirs(save_dir, exist_ok=True)
106
- if path.exists(save_path):
107
- with open(save_path, 'r') as f: content_sim_list = json.load(f)
108
- else: content_sim_list = []
109
-
110
- ## path
111
- gt_dir, result_dir = args.gt_dir, args.result_dir
112
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
113
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
114
-
115
- ## eval
116
- for index in range(25, 100):
117
- # video -> subtitle
118
- if path.basename(args.result_dir) == "paper2video":
119
- p2v_video_path = path.join(result_list[index], "3_merage.mp4")
120
- if path.exists(p2v_video_path) is False: continue
121
- else:
122
- p2v_video_path = path.join(result_list[index], "result.mp4")
123
- if path.exists(p2v_video_path) is False: continue
124
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
125
- extract_plain_subtitle_with_whisperx(gt_video_path, gt_video_path.replace(".mp4", "_sub.txt"))
126
- extract_plain_subtitle_with_whisperx(p2v_video_path, p2v_video_path.replace(".mp4", "_sub.txt"))
127
-
128
- # slide dir
129
- gt_slide_dir = path.join(groundtruth_list[index], "slide_imgs")
130
- p2v_slide_dir = path.join(result_list[index], "slide_imgs")
131
-
132
- # eval
133
- result = run_similarity_eval(
134
- slide_dir_1=gt_slide_dir,
135
- slide_dir_2=p2v_slide_dir,
136
- sub_path_1=gt_video_path.replace(".mp4", "_sub.txt"),
137
- sub_path_2=p2v_video_path.replace(".mp4", "_sub.txt"))
138
- content_score = extract_similarity_scores(result)
139
- content_sim_list.append({
140
- "data_idx": index,
141
- "score": content_score
142
- })
143
-
144
- with open(save_path, 'w') as f: json.dump(content_sim_list, f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentArena.py DELETED
@@ -1,106 +0,0 @@
1
- '''
2
- Using VideoLLM (Gemini) as judger
3
- '''
4
- import os, re, json
5
- import time
6
- import argparse
7
- import google.generativeai as genai
8
- from os import path
9
- from typing import List
10
- from pathlib import Path
11
- from tqdm import tqdm
12
-
13
-
14
- genai.configure(api_key="")
15
def eval_gemini(gt_vid_path, gen_vid_path):
    """Ask Gemini which of two presentation videos is better.

    Uploads both videos, polls until they become ACTIVE, then sends the
    pairwise-comparison rubric. Returns the judge's raw reply text, or None
    when either upload fails server-side.

    Fix vs. previous revision: the rubric is read verbatim with f.read() —
    '"/n".join(f.readlines())' injected a literal "/n" between every line of
    the prompt.
    """
    model = genai.GenerativeModel("models/gemini-2.5-pro")
    gt_vid = genai.upload_file(path=gt_vid_path, mime_type="video/mp4")
    gen_vid = genai.upload_file(path=gen_vid_path, mime_type="video/mp4")
    # poll until both files finish server-side processing
    while True:
        refreshed_1 = genai.get_file(gt_vid.name)
        refreshed_2 = genai.get_file(gen_vid.name)
        if refreshed_1.state.name == "ACTIVE" and refreshed_2.state.name == "ACTIVE": break
        elif refreshed_1.state.name == "FAILED" or refreshed_2.state.name == "FAILED":
            # treat a failed upload as "no verdict" rather than crashing the sweep
            return None
        else:
            print(f"waiting 5 seconds...")
            time.sleep(5)

    prompt_path = "./prompt/which_is_better.txt"
    with open(prompt_path, 'r') as f:
        prompt = f.read()
    print("Sending prompt to Gemini...")
    response = model.generate_content([prompt, refreshed_1, refreshed_2])
    print("\n===== Evaluation Result =====")
    print(response.text)
    print("=============================\n")

    return response.text
40
-
41
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Numeric-prefix sort of paths by basename; non-numeric entries last."""
    def prefix_key(entry: str):
        base = Path(entry).name
        matched = _num_at_start.match(base)
        return ((int(matched.group(1)) if matched else float('inf')), base)

    return sorted(paths, key=prefix_key)
48
-
49
- if __name__ == "__main__":
50
- parser = argparse.ArgumentParser()
51
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
52
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
53
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
54
- args = parser.parse_args()
55
-
56
- ## load exist result if have
57
- save_dir = args.save_dir
58
- if path.basename(args.result_dir) == "paper2video":
59
- save_dir = path.join(save_dir, path.basename(args.result_dir))
60
- else: save_dir = path.join(save_dir, path.basename(args.result_dir))
61
-
62
- save_path = path.join(save_dir, "video_arena.json")
63
- os.makedirs(save_dir, exist_ok=True)
64
- if path.exists(save_path):
65
- with open(save_path, 'r') as f: arena_score_list = json.load(f)
66
- else: arena_score_list = []
67
-
68
- ## path
69
- gt_dir, result_dir = args.gt_dir, args.result_dir
70
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
71
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
72
-
73
- ## Generated v.s GT (1)
74
- for index in tqdm(len(result_list)):
75
- if path.basename(args.result_dir) == "paper2video":
76
- test_video_path = path.join(result_list[index], "3_merage.mp4")
77
- elif path.basename(args.result_dir) == 'veo3':
78
- test_video_path = result_list[index]
79
- else:
80
- test_video_path = path.join(result_list[index], "result.mp4")
81
-
82
- if path.exists(test_video_path) is False: continue
83
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
84
- if path.exists(gt_video_path) is False:
85
- gt_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
86
- if path.exists(gt_video_path) is False: continue
87
- result = eval_gemini(gt_video_path, test_video_path)
88
- if result is None: continue
89
-
90
- pat = r"\[(?:A|B)\]"
91
- m = re.findall(pat, result, flags=re.I)
92
- score = 0
93
- if m[0][1] == "B": score += 1
94
-
95
- result = eval_gemini(test_video_path, gt_video_path)
96
- if result is None: continue
97
-
98
- pat = r"\[(?:A|B)\]"
99
- m = re.findall(pat, result, flags=re.I)
100
- if m[0][1] == "A": score += 1
101
-
102
- arena_score_list.append({
103
- "data_idx": index,
104
- "score": score/2
105
- })
106
- with open(save_path, 'w') as f: json.dump(arena_score_list, f, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py DELETED
@@ -1,264 +0,0 @@
1
- import random
2
- import string
3
- import yaml
4
- import PIL
5
- import tempfile
6
- import io
7
- import argparse
8
- from os import path
9
- from camel.models import ModelFactory
10
- from math import ceil
11
- from openai import OpenAI
12
- from camel.messages import BaseMessage
13
- from utils.src.model_utils import parse_pdf
14
- from urllib.parse import unquote
15
- from copy import deepcopy
16
- from transformers import AutoTokenizer, AutoModelForCausalLM
17
- from pytorch_fid.fid_score import compute_statistics_of_path
18
- import pytorch_fid.fid_score as fid
19
- from PIL import Image
20
- from httpx import Timeout
21
- from docling.document_converter import DocumentConverter, PdfFormatOption
22
- import re
23
- import shutil
24
- import pytesseract
25
- from utils.wei_utils import account_token
26
- from camel.types import ModelPlatformType, ModelType
27
- from marker.models import create_model_dict
28
- from camel.configs import ChatGPTConfig
29
- from camel.agents import ChatAgent
30
- from jinja2 import Environment, StrictUndefined
31
- from utils.src.utils import get_json_from_response
32
- from pathlib import Path
33
- from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
34
- from collections import defaultdict
35
- from camel.configs import ChatGPTConfig, QwenConfig, VLLMConfig, OpenRouterConfig, GeminiConfig
36
-
37
- from docling.datamodel.base_models import InputFormat
38
- from docling.datamodel.pipeline_options import PdfPipelineOptions
39
- from docling.document_converter import DocumentConverter, PdfFormatOption
40
-
41
- import math
42
- import base64
43
- import requests
44
- from io import BytesIO
45
- from PIL import Image
46
-
47
- import torch
48
- import json
49
- import os
50
- import pickle as pkl
51
- import numpy as np
52
- from transformers import AltCLIPProcessor, AltCLIPModel
53
- from pathlib import Path
54
- from typing import List
55
- from moviepy.editor import VideoFileClip
56
-
57
-
58
- os.environ["GEMINI_API_KEY"] = ""
59
-
60
def compute_accuracy(predicted, ground_truth, aspects):
    """Grade multiple-choice answers overall and per aspect.

    Parameters
    ----------
    predicted : dict
        {question: {'answer': <letter>, ...}, ...}
    ground_truth : dict
        {question: '<letter>. full answer', ...}
    aspects : dict
        {question: '<aspect name>', ...}

    Returns
    -------
    overall_accuracy : float
        correct / len(ground_truth) (0.0 when there are no questions).
    aspect_summary : dict
        {aspect: {'total': int, 'correct': int, 'accuracy': float}, ...}
    """
    total_global = len(ground_truth)
    total_by_aspect = defaultdict(int)
    correct_by_aspect = defaultdict(int)
    correct_global = 0

    for question, pred_info in predicted.items():
        aspect = aspects.get(question, 'Unknown')
        total_by_aspect[aspect] += 1

        if question not in ground_truth:
            continue
        # ground-truth answers look like "B. some text" — keep the letter only
        gt_letter = ground_truth[question].split('.')[0].strip()

        pred_letter = pred_info['answer']
        if not pred_letter:
            continue
        if pred_letter[0].upper() == gt_letter:
            correct_global += 1
            correct_by_aspect[aspect] += 1

    overall_accuracy = correct_global / total_global if total_global else 0.0

    aspect_summary = {
        aspect: {
            'total': total,
            'correct': correct_by_aspect[aspect],
            'accuracy': correct_by_aspect[aspect] / total if total else 0.0,
        }
        for aspect, total in total_by_aspect.items()
    }

    return overall_accuracy, aspect_summary
118
-
119
def eval_qa_get_answer(video_input, questions, answers, aspects, agent_config, input_type='video'):
    """Have an LLM agent answer quiz questions from a presentation video and
    grade the answers.

    Parameters
    ----------
    video_input : str     path to the video file shown to the agent
    questions : dict      question text keyed by question id
    answers : dict        ground-truth answers ('<letter>. text')
    aspects : dict        aspect label per question
    agent_config : dict   CAMEL model platform/type/config
    input_type : str      only 'video' is implemented

    Returns (accuracy, aspect_accuracy, agent_answers, input_token, output_token).

    Fix vs. previous revision: any input_type other than 'video' used to fall
    through with prompt/msg/response unbound and crash with a NameError; it
    now raises a clear ValueError up front, and the video file is only read
    when it will actually be used.
    """
    agent_name = f'answer_question_from_{input_type}'
    with open(f"prompt/{agent_name}.yaml", "r") as f:
        config = yaml.safe_load(f)

    actor_model = ModelFactory.create(
        model_platform=agent_config['model_platform'],
        model_type=agent_config['model_type'],
        model_config_dict=agent_config['model_config'],)

    actor_agent = ChatAgent(system_message=config['system_prompt'], model=actor_model, message_window_size=None,)
    actor_agent.reset()

    jinja_env = Environment(undefined=StrictUndefined)
    template = jinja_env.from_string(config["template"])

    if input_type != 'video':
        raise ValueError(f"unsupported input_type: {input_type!r}; only 'video' is implemented")

    with open(video_input, "rb") as f:
        video_bytes = f.read()
    prompt = template.render(**{'questions': questions,})

    # tell the agent the clip length so it does not cite impossible timestamps
    clip = VideoFileClip(video_input)
    duration = clip.duration
    msg = BaseMessage.make_user_message(
        role_name="User",
        content=prompt+"The video length is {}, you should NOT reference the timesteps if it exceeds video length".format(str(duration)),
        video_bytes=video_bytes,
        video_detail="low")
    response = actor_agent.step(msg)
    agent_answers = get_json_from_response(response.msgs[0].content)

    input_token, output_token = account_token(response)
    accuracy, aspect_accuracy = compute_accuracy(agent_answers, answers, aspects)
    return accuracy, aspect_accuracy, agent_answers, input_token, output_token
152
-
153
def run_qa_metric(question_path, video_path, result_path, test_model):
    """Run PresentQuiz for one paper/video pair and write the result JSON.

    question_path : JSON with 'detail' and 'understanding' tracks, each holding
                    questions / answers / aspects (see create_paper_questions).
    video_path    : presentation video the judge answers from.
    result_path   : output JSON for accuracies and raw agent answers.
    test_model    : judge identifier; only 'gemini' is currently wired up.
    """
    # NOTE(review): agent_config is only bound for 'gemini'; any other
    # test_model reaches eval_qa_get_answer with it undefined (NameError) —
    # confirm whether additional judges were planned here.
    if test_model == "gemini":
        agent_config = {
            "model_type": ModelType.GEMINI_2_5_FLASH,
            "model_config": GeminiConfig().as_dict(),
            "model_platform": ModelPlatformType.GEMINI,
        }
    overall_qa_result = {"qa_result": {}}

    qa_dict = json.load(open(question_path, 'r'))
    detail_qa, understanding_qa = qa_dict['detail'], qa_dict['understanding']
    input_token_all, output_token_all =0, 0
    # Track 1: fine-grained detail questions answered straight from the video.
    detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=detail_qa['questions'],
        answers=detail_qa['answers'],
        aspects=detail_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token
    # Track 2: holistic understanding questions.
    understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=understanding_qa['questions'],
        answers=understanding_qa['answers'],
        aspects=understanding_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token
    overall_qa_result['qa_result'][test_model] = {
        'detail_accuracy': detail_accuracy,
        'detail_aspect_accuracy': detail_aspect_accuracy,
        'detail_agent_answers': detail_agent_answers,
        'understanding_accuracy': understanding_accuracy,
        'understanding_aspect_accuracy': understanding_aspect_accuracy,
        'understanding_agent_answers': understanding_agent_answers}
    # NOTE(review): overall_qa_result is rebuilt fresh above, so this average
    # only ever covers the single test_model just evaluated — confirm whether
    # it was meant to merge with a previously saved result file first.
    all_models_in_file = list(overall_qa_result['qa_result'].keys())
    detail_accs = []
    understanding_accs = []
    for m in all_models_in_file:
        detail_accs.append(overall_qa_result['qa_result'][m]['detail_accuracy'])
        understanding_accs.append(overall_qa_result['qa_result'][m]['understanding_accuracy'])

    avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
    avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

    overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
    overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

    # Finally, overwrite the same JSON file with the updated results
    with open(result_path, 'w') as f: json.dump(overall_qa_result, f, indent=4)
    print(detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token)
206
-
207
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort paths by the numeric prefix of the basename; others sort last."""
    def as_key(item: str):
        leaf = Path(item).name
        num = _num_at_start.match(leaf)
        return (int(num.group(1)) if num else float('inf'), leaf)

    return sorted(paths, key=as_key)
214
-
215
- if __name__ == "__main__":
216
- parser = argparse.ArgumentParser()
217
- parser.add_argument("-r", "--result_dir", default="/path/to/result")
218
- parser.add_argument("-g", "--data_dir", default="/path/to/data")
219
- parser.add_argument("-s", "--save_dir", default="/path/to/data")
220
- args = parser.parse_args()
221
- ## mkdirs
222
- save_dir = args.save_dir
223
- if path.basename(args.result_dir) == "paper2video":
224
- save_dir = path.join(save_dir, path.basename(args.result_dir))
225
- else: save_dir = path.join(save_dir, path.basename(args.result_dir))
226
-
227
- save_path = path.join(save_dir, "qa_result")
228
- os.makedirs(save_dir, exist_ok=True)
229
- os.makedirs(save_path, exist_ok=True)
230
-
231
- ## run test
232
- gt_dir, result_dir = args.data_dir, args.result_dir
233
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
234
- if path.basename(args.result_dir) == "human_made": result_list = [] # from dataset
235
- else: result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
236
-
237
- start, end = 1, 100
238
- for index in range(start, end):
239
- qa_json_path = path.join(groundtruth_list[index], "4o-mini_qa.json")
240
-
241
- ## paper2video
242
- if path.basename(args.result_dir) == 'paper2video':
243
- if without_presenter_flag is False:
244
- test_video_path = path.join(result_list[index], "3_merage.mp4")
245
- else:
246
- test_video_path = path.join(result_list[index], "1_merage.mp4")
247
- if path.exists(test_video_path) is False: continue
248
- ## human made as baseline
249
- elif path.basename(args.result_dir) == 'human_made':
250
- test_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
251
- if path.exists(test_video_path) is False:
252
- test_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
253
- ## veo3
254
- elif path.basename(args.result_dir) == 'veo3':
255
- test_video_path = result_list[index]
256
- elif path.basename(args.result_dir) == 'wan2.1':
257
- test_video_path = path.join(result_list[index], "result.mp4")
258
- ## presentagent
259
- else:
260
- test_video_path = path.join(result_list[index], "result.mp4")
261
- if path.exists(test_video_path) is False: continue
262
- result_save_path = path.join(save_path, "qa_result_{}.json".format(index))
263
- print("start")
264
- run_qa_metric(qa_json_path, test_video_path, result_save_path, 'gemini')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py DELETED
@@ -1,47 +0,0 @@
1
- from utils.poster_eval_utils import *
2
- import argparse
3
- import os
4
- import json
5
-
6
-
7
- os.environ["OPENAI_API_KEY"] = ""
8
-
9
-
10
- if __name__ == '__main__':
11
- parser = argparse.ArgumentParser()
12
- parser.add_argument('--paper_folder', type=str, default="path/to/data")
13
- parser.add_argument('--model_name', type=str, default='4o')
14
- args = parser.parse_args()
15
-
16
- paper_text = get_poster_text(os.path.join(args.paper_folder, 'pdf', 'paper.pdf'))
17
-
18
- if args.model_name == '4o':
19
- model_type = ModelType.GPT_4O
20
- elif args.model_name == 'o3':
21
- model_type = ModelType.O3
22
- elif args.model_name == 'gemini':
23
- model_type = ModelType.GEMINI_2_5_PRO
24
-
25
- detail_qa = get_questions(paper_text, 'detail', model_type)
26
- understanding_qa = get_questions(paper_text, 'understanding', model_type)
27
-
28
- detail_q, detail_a, detail_aspects = get_answers_and_remove_answers(detail_qa)
29
- understanding_q, understanding_a, understanding_aspects = get_answers_and_remove_answers(understanding_qa)
30
-
31
- final_qa = {}
32
- detail_qa = {
33
- 'questions': detail_q,
34
- 'answers': detail_a,
35
- 'aspects': detail_aspects,
36
- }
37
-
38
- understanding_qa = {
39
- 'questions': understanding_q,
40
- 'answers': understanding_a,
41
- 'aspects': understanding_aspects,
42
- }
43
- final_qa['detail'] = detail_qa
44
- final_qa['understanding'] = understanding_qa
45
-
46
- with open(os.path.join(args.paper_folder, f'{args.model_name}_qa.json'), 'w') as f:
47
- json.dump(final_qa, f, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py DELETED
@@ -1,63 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import TYPE_CHECKING, Set, Union
5
-
6
- from docling_core.types.doc import DoclingDocument
7
-
8
- if TYPE_CHECKING:
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import InputDocument
11
-
12
-
13
- class AbstractDocumentBackend(ABC):
14
- @abstractmethod
15
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
- self.file = in_doc.file
17
- self.path_or_stream = path_or_stream
18
- self.document_hash = in_doc.document_hash
19
- self.input_format = in_doc.format
20
-
21
- @abstractmethod
22
- def is_valid(self) -> bool:
23
- pass
24
-
25
- @classmethod
26
- @abstractmethod
27
- def supports_pagination(cls) -> bool:
28
- pass
29
-
30
- def unload(self):
31
- if isinstance(self.path_or_stream, BytesIO):
32
- self.path_or_stream.close()
33
-
34
- self.path_or_stream = None
35
-
36
- @classmethod
37
- @abstractmethod
38
- def supported_formats(cls) -> Set["InputFormat"]:
39
- pass
40
-
41
-
42
- class PaginatedDocumentBackend(AbstractDocumentBackend):
43
- """DeclarativeDocumentBackend.
44
-
45
- A declarative document backend is a backend that can transform to DoclingDocument
46
- straight without a recognition pipeline.
47
- """
48
-
49
- @abstractmethod
50
- def page_count(self) -> int:
51
- pass
52
-
53
-
54
- class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
- """DeclarativeDocumentBackend.
56
-
57
- A declarative document backend is a backend that can transform to DoclingDocument
58
- straight without a recognition pipeline.
59
- """
60
-
61
- @abstractmethod
62
- def convert(self) -> DoclingDocument:
63
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py DELETED
@@ -1,430 +0,0 @@
1
- import logging
2
- import re
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Set, Union
6
-
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupItem,
12
- GroupLabel,
13
- ImageRef,
14
- Size,
15
- TableCell,
16
- TableData,
17
- )
18
-
19
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
- from docling.datamodel.base_models import InputFormat
21
- from docling.datamodel.document import InputDocument
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class AsciiDocBackend(DeclarativeDocumentBackend):
27
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
28
- super().__init__(in_doc, path_or_stream)
29
-
30
- self.path_or_stream = path_or_stream
31
-
32
- try:
33
- if isinstance(self.path_or_stream, BytesIO):
34
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
35
- self.lines = text_stream.split("\n")
36
- if isinstance(self.path_or_stream, Path):
37
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
38
- self.lines = f.readlines()
39
- self.valid = True
40
-
41
- except Exception as e:
42
- raise RuntimeError(
43
- f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
44
- ) from e
45
- return
46
-
47
- def is_valid(self) -> bool:
48
- return self.valid
49
-
50
- @classmethod
51
- def supports_pagination(cls) -> bool:
52
- return False
53
-
54
- def unload(self):
55
- return
56
-
57
- @classmethod
58
- def supported_formats(cls) -> Set[InputFormat]:
59
- return {InputFormat.ASCIIDOC}
60
-
61
- def convert(self) -> DoclingDocument:
62
- """
63
- Parses the ASCII into a structured document model.
64
- """
65
-
66
- origin = DocumentOrigin(
67
- filename=self.file.name or "file",
68
- mimetype="text/asciidoc",
69
- binary_hash=self.document_hash,
70
- )
71
-
72
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
73
-
74
- doc = self._parse(doc)
75
-
76
- return doc
77
-
78
- def _parse(self, doc: DoclingDocument):
79
- """
80
- Main function that orchestrates the parsing by yielding components:
81
- title, section headers, text, lists, and tables.
82
- """
83
-
84
- content = ""
85
-
86
- in_list = False
87
- in_table = False
88
-
89
- text_data: list[str] = []
90
- table_data: list[str] = []
91
- caption_data: list[str] = []
92
-
93
- # parents: dict[int, Union[DocItem, GroupItem, None]] = {}
94
- parents: dict[int, Union[GroupItem, None]] = {}
95
- # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
96
- indents: dict[int, Union[GroupItem, None]] = {}
97
-
98
- for i in range(0, 10):
99
- parents[i] = None
100
- indents[i] = None
101
-
102
- for line in self.lines:
103
- # line = line.strip()
104
-
105
- # Title
106
- if self._is_title(line):
107
- item = self._parse_title(line)
108
- level = item["level"]
109
-
110
- parents[level] = doc.add_text(
111
- text=item["text"], label=DocItemLabel.TITLE
112
- )
113
-
114
- # Section headers
115
- elif self._is_section_header(line):
116
- item = self._parse_section_header(line)
117
- level = item["level"]
118
-
119
- parents[level] = doc.add_heading(
120
- text=item["text"], level=item["level"], parent=parents[level - 1]
121
- )
122
- for k, v in parents.items():
123
- if k > level:
124
- parents[k] = None
125
-
126
- # Lists
127
- elif self._is_list_item(line):
128
-
129
- _log.debug(f"line: {line}")
130
- item = self._parse_list_item(line)
131
- _log.debug(f"parsed list-item: {item}")
132
-
133
- level = self._get_current_level(parents)
134
-
135
- if not in_list:
136
- in_list = True
137
-
138
- parents[level + 1] = doc.add_group(
139
- parent=parents[level], name="list", label=GroupLabel.LIST
140
- )
141
- indents[level + 1] = item["indent"]
142
-
143
- elif in_list and item["indent"] > indents[level]:
144
- parents[level + 1] = doc.add_group(
145
- parent=parents[level], name="list", label=GroupLabel.LIST
146
- )
147
- indents[level + 1] = item["indent"]
148
-
149
- elif in_list and item["indent"] < indents[level]:
150
-
151
- # print(item["indent"], " => ", indents[level])
152
- while item["indent"] < indents[level]:
153
- # print(item["indent"], " => ", indents[level])
154
- parents[level] = None
155
- indents[level] = None
156
- level -= 1
157
-
158
- doc.add_list_item(
159
- item["text"], parent=self._get_current_parent(parents)
160
- )
161
-
162
- elif in_list and not self._is_list_item(line):
163
- in_list = False
164
-
165
- level = self._get_current_level(parents)
166
- parents[level] = None
167
-
168
- # Tables
169
- elif line.strip() == "|===" and not in_table: # start of table
170
- in_table = True
171
-
172
- elif self._is_table_line(line): # within a table
173
- in_table = True
174
- table_data.append(self._parse_table_line(line))
175
-
176
- elif in_table and (
177
- (not self._is_table_line(line)) or line.strip() == "|==="
178
- ): # end of table
179
-
180
- caption = None
181
- if len(caption_data) > 0:
182
- caption = doc.add_text(
183
- text=" ".join(caption_data), label=DocItemLabel.CAPTION
184
- )
185
-
186
- caption_data = []
187
-
188
- data = self._populate_table_as_grid(table_data)
189
- doc.add_table(
190
- data=data, parent=self._get_current_parent(parents), caption=caption
191
- )
192
-
193
- in_table = False
194
- table_data = []
195
-
196
- # Picture
197
- elif self._is_picture(line):
198
-
199
- caption = None
200
- if len(caption_data) > 0:
201
- caption = doc.add_text(
202
- text=" ".join(caption_data), label=DocItemLabel.CAPTION
203
- )
204
-
205
- caption_data = []
206
-
207
- item = self._parse_picture(line)
208
-
209
- size = None
210
- if "width" in item and "height" in item:
211
- size = Size(width=int(item["width"]), height=int(item["height"]))
212
-
213
- uri = None
214
- if (
215
- "uri" in item
216
- and not item["uri"].startswith("http")
217
- and item["uri"].startswith("//")
218
- ):
219
- uri = "file:" + item["uri"]
220
- elif (
221
- "uri" in item
222
- and not item["uri"].startswith("http")
223
- and item["uri"].startswith("/")
224
- ):
225
- uri = "file:/" + item["uri"]
226
- elif "uri" in item and not item["uri"].startswith("http"):
227
- uri = "file://" + item["uri"]
228
-
229
- image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
230
- doc.add_picture(image=image, caption=caption)
231
-
232
- # Caption
233
- elif self._is_caption(line) and len(caption_data) == 0:
234
- item = self._parse_caption(line)
235
- caption_data.append(item["text"])
236
-
237
- elif (
238
- len(line.strip()) > 0 and len(caption_data) > 0
239
- ): # allow multiline captions
240
- item = self._parse_text(line)
241
- caption_data.append(item["text"])
242
-
243
- # Plain text
244
- elif len(line.strip()) == 0 and len(text_data) > 0:
245
- doc.add_text(
246
- text=" ".join(text_data),
247
- label=DocItemLabel.PARAGRAPH,
248
- parent=self._get_current_parent(parents),
249
- )
250
- text_data = []
251
-
252
- elif len(line.strip()) > 0: # allow multiline texts
253
-
254
- item = self._parse_text(line)
255
- text_data.append(item["text"])
256
-
257
- if len(text_data) > 0:
258
- doc.add_text(
259
- text=" ".join(text_data),
260
- label=DocItemLabel.PARAGRAPH,
261
- parent=self._get_current_parent(parents),
262
- )
263
- text_data = []
264
-
265
- if in_table and len(table_data) > 0:
266
- data = self._populate_table_as_grid(table_data)
267
- doc.add_table(data=data, parent=self._get_current_parent(parents))
268
-
269
- in_table = False
270
- table_data = []
271
-
272
- return doc
273
-
274
- def _get_current_level(self, parents):
275
- for k, v in parents.items():
276
- if v == None and k > 0:
277
- return k - 1
278
-
279
- return 0
280
-
281
- def _get_current_parent(self, parents):
282
- for k, v in parents.items():
283
- if v == None and k > 0:
284
- return parents[k - 1]
285
-
286
- return None
287
-
288
- # ========= Title
289
- def _is_title(self, line):
290
- return re.match(r"^= ", line)
291
-
292
- def _parse_title(self, line):
293
- return {"type": "title", "text": line[2:].strip(), "level": 0}
294
-
295
- # ========= Section headers
296
- def _is_section_header(self, line):
297
- return re.match(r"^==+", line)
298
-
299
- def _parse_section_header(self, line):
300
- match = re.match(r"^(=+)\s+(.*)", line)
301
-
302
- marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
303
- text = match.group(2) # The actual text of the list item
304
-
305
- header_level = marker.count("=") # number of '=' represents level
306
- return {
307
- "type": "header",
308
- "level": header_level - 1,
309
- "text": text.strip(),
310
- }
311
-
312
- # ========= Lists
313
- def _is_list_item(self, line):
314
- return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
315
-
316
- def _parse_list_item(self, line):
317
- """Extract the item marker (number or bullet symbol) and the text of the item."""
318
-
319
- match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
320
- if match:
321
- indent = match.group(1)
322
- marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
323
- text = match.group(3) # The actual text of the list item
324
-
325
- if marker == "*" or marker == "-":
326
- return {
327
- "type": "list_item",
328
- "marker": marker,
329
- "text": text.strip(),
330
- "numbered": False,
331
- "indent": 0 if indent == None else len(indent),
332
- }
333
- else:
334
- return {
335
- "type": "list_item",
336
- "marker": marker,
337
- "text": text.strip(),
338
- "numbered": True,
339
- "indent": 0 if indent == None else len(indent),
340
- }
341
- else:
342
- # Fallback if no match
343
- return {
344
- "type": "list_item",
345
- "marker": "-",
346
- "text": line,
347
- "numbered": False,
348
- "indent": 0,
349
- }
350
-
351
- # ========= Tables
352
- def _is_table_line(self, line):
353
- return re.match(r"^\|.*\|", line)
354
-
355
- def _parse_table_line(self, line):
356
- # Split table cells and trim extra spaces
357
- return [cell.strip() for cell in line.split("|") if cell.strip()]
358
-
359
- def _populate_table_as_grid(self, table_data):
360
-
361
- num_rows = len(table_data)
362
-
363
- # Adjust the table data into a grid format
364
- num_cols = max(len(row) for row in table_data)
365
-
366
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
367
- for row_idx, row in enumerate(table_data):
368
- # Pad rows with empty strings to match column count
369
- # grid.append(row + [''] * (max_cols - len(row)))
370
-
371
- for col_idx, text in enumerate(row):
372
- row_span = 1
373
- col_span = 1
374
-
375
- cell = TableCell(
376
- text=text,
377
- row_span=row_span,
378
- col_span=col_span,
379
- start_row_offset_idx=row_idx,
380
- end_row_offset_idx=row_idx + row_span,
381
- start_col_offset_idx=col_idx,
382
- end_col_offset_idx=col_idx + col_span,
383
- col_header=False,
384
- row_header=False,
385
- )
386
- data.table_cells.append(cell)
387
-
388
- return data
389
-
390
- # ========= Pictures
391
- def _is_picture(self, line):
392
- return re.match(r"^image::", line)
393
-
394
- def _parse_picture(self, line):
395
- """
396
- Parse an image macro, extracting its path and attributes.
397
- Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
398
- """
399
- mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
400
- if mtch:
401
- picture_path = mtch.group(1).strip()
402
- attributes = mtch.group(2).split(",")
403
- picture_info = {"type": "picture", "uri": picture_path}
404
-
405
- # Extract optional attributes (alt text, width, height, alignment)
406
- if attributes:
407
- picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
408
- for attr in attributes[1:]:
409
- key, value = attr.split("=")
410
- picture_info[key.strip()] = value.strip()
411
-
412
- return picture_info
413
-
414
- return {"type": "picture", "uri": line}
415
-
416
- # ========= Captions
417
- def _is_caption(self, line):
418
- return re.match(r"^\.(.+)", line)
419
-
420
- def _parse_caption(self, line):
421
- mtch = re.match(r"^\.(.+)", line)
422
- if mtch:
423
- text = mtch.group(1)
424
- return {"type": "caption", "text": text}
425
-
426
- return {"type": "caption", "text": ""}
427
-
428
- # ========= Plain text
429
- def _parse_text(self, line):
430
- return {"type": "text", "text": line.strip()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py DELETED
@@ -1,227 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.pdf_parsers import pdf_parser_v1
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfPage
12
-
13
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell
15
- from docling.datamodel.document import InputDocument
16
-
17
- _log = logging.getLogger(__name__)
18
-
19
-
20
- class DoclingParsePageBackend(PdfPageBackend):
21
- def __init__(
22
- self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
23
- ):
24
- self._ppage = page_obj
25
- parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
-
27
- self.valid = "pages" in parsed_page
28
- if self.valid:
29
- self._dpage = parsed_page["pages"][0]
30
- else:
31
- _log.info(
32
- f"An error occurred when loading page {page_no} of document {document_hash}."
33
- )
34
-
35
- def is_valid(self) -> bool:
36
- return self.valid
37
-
38
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
39
- if not self.valid:
40
- return ""
41
- # Find intersecting cells on the page
42
- text_piece = ""
43
- page_size = self.get_size()
44
- parser_width = self._dpage["width"]
45
- parser_height = self._dpage["height"]
46
-
47
- scale = (
48
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
49
- )
50
-
51
- for i in range(len(self._dpage["cells"])):
52
- rect = self._dpage["cells"][i]["box"]["device"]
53
- x0, y0, x1, y1 = rect
54
- cell_bbox = BoundingBox(
55
- l=x0 * scale * page_size.width / parser_width,
56
- b=y0 * scale * page_size.height / parser_height,
57
- r=x1 * scale * page_size.width / parser_width,
58
- t=y1 * scale * page_size.height / parser_height,
59
- coord_origin=CoordOrigin.BOTTOMLEFT,
60
- ).to_top_left_origin(page_height=page_size.height * scale)
61
-
62
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
63
-
64
- if overlap_frac > 0.5:
65
- if len(text_piece) > 0:
66
- text_piece += " "
67
- text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
68
-
69
- return text_piece
70
-
71
- def get_text_cells(self) -> Iterable[Cell]:
72
- cells: List[Cell] = []
73
- cell_counter = 0
74
-
75
- if not self.valid:
76
- return cells
77
-
78
- page_size = self.get_size()
79
-
80
- parser_width = self._dpage["width"]
81
- parser_height = self._dpage["height"]
82
-
83
- for i in range(len(self._dpage["cells"])):
84
- rect = self._dpage["cells"][i]["box"]["device"]
85
- x0, y0, x1, y1 = rect
86
-
87
- if x1 < x0:
88
- x0, x1 = x1, x0
89
- if y1 < y0:
90
- y0, y1 = y1, y0
91
-
92
- text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
93
- cells.append(
94
- Cell(
95
- id=cell_counter,
96
- text=text_piece,
97
- bbox=BoundingBox(
98
- # l=x0, b=y0, r=x1, t=y1,
99
- l=x0 * page_size.width / parser_width,
100
- b=y0 * page_size.height / parser_height,
101
- r=x1 * page_size.width / parser_width,
102
- t=y1 * page_size.height / parser_height,
103
- coord_origin=CoordOrigin.BOTTOMLEFT,
104
- ).to_top_left_origin(page_size.height),
105
- )
106
- )
107
- cell_counter += 1
108
-
109
- def draw_clusters_and_cells():
110
- image = (
111
- self.get_page_image()
112
- ) # make new image to avoid drawing on the saved ones
113
- draw = ImageDraw.Draw(image)
114
- for c in cells:
115
- x0, y0, x1, y1 = c.bbox.as_tuple()
116
- cell_color = (
117
- random.randint(30, 140),
118
- random.randint(30, 140),
119
- random.randint(30, 140),
120
- )
121
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
122
- image.show()
123
-
124
- # before merge:
125
- # draw_clusters_and_cells()
126
-
127
- # cells = merge_horizontal_cells(cells)
128
-
129
- # after merge:
130
- # draw_clusters_and_cells()
131
-
132
- return cells
133
-
134
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
135
- AREA_THRESHOLD = 0 # 32 * 32
136
-
137
- for i in range(len(self._dpage["images"])):
138
- bitmap = self._dpage["images"][i]
139
- cropbox = BoundingBox.from_tuple(
140
- bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
141
- ).to_top_left_origin(self.get_size().height)
142
-
143
- if cropbox.area() > AREA_THRESHOLD:
144
- cropbox = cropbox.scaled(scale=scale)
145
-
146
- yield cropbox
147
-
148
- def get_page_image(
149
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
150
- ) -> Image.Image:
151
-
152
- page_size = self.get_size()
153
-
154
- if not cropbox:
155
- cropbox = BoundingBox(
156
- l=0,
157
- r=page_size.width,
158
- t=0,
159
- b=page_size.height,
160
- coord_origin=CoordOrigin.TOPLEFT,
161
- )
162
- padbox = BoundingBox(
163
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
164
- )
165
- else:
166
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
167
- padbox.r = page_size.width - padbox.r
168
- padbox.t = page_size.height - padbox.t
169
-
170
- image = (
171
- self._ppage.render(
172
- scale=scale * 1.5,
173
- rotation=0, # no additional rotation
174
- crop=padbox.as_tuple(),
175
- )
176
- .to_pil()
177
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
178
- ) # We resize the image from 1.5x the given scale to make it sharper.
179
-
180
- return image
181
-
182
- def get_size(self) -> Size:
183
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
184
-
185
- def unload(self):
186
- self._ppage = None
187
- self._dpage = None
188
-
189
-
190
- class DoclingParseDocumentBackend(PdfDocumentBackend):
191
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
- super().__init__(in_doc, path_or_stream)
193
-
194
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
195
- self.parser = pdf_parser_v1()
196
-
197
- success = False
198
- if isinstance(self.path_or_stream, BytesIO):
199
- success = self.parser.load_document_from_bytesio(
200
- self.document_hash, self.path_or_stream
201
- )
202
- elif isinstance(self.path_or_stream, Path):
203
- success = self.parser.load_document(
204
- self.document_hash, str(self.path_or_stream)
205
- )
206
-
207
- if not success:
208
- raise RuntimeError(
209
- f"docling-parse could not load document with hash {self.document_hash}."
210
- )
211
-
212
- def page_count(self) -> int:
213
- return len(self._pdoc) # To be replaced with docling-parse API
214
-
215
- def load_page(self, page_no: int) -> DoclingParsePageBackend:
216
- return DoclingParsePageBackend(
217
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
218
- )
219
-
220
- def is_valid(self) -> bool:
221
- return self.page_count() > 0
222
-
223
- def unload(self):
224
- super().unload()
225
- self.parser.unload_document(self.document_hash)
226
- self._pdoc.close()
227
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py DELETED
@@ -1,250 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.pdf_parsers import pdf_parser_v2
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfPage
12
-
13
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell, Size
15
-
16
- if TYPE_CHECKING:
17
- from docling.datamodel.document import InputDocument
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class DoclingParseV2PageBackend(PdfPageBackend):
23
- def __init__(
24
- self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
25
- ):
26
- self._ppage = page_obj
27
- parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
28
-
29
- self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
30
- if self.valid:
31
- self._dpage = parsed_page["pages"][0]
32
- else:
33
- _log.info(
34
- f"An error occurred when loading page {page_no} of document {document_hash}."
35
- )
36
-
37
- def is_valid(self) -> bool:
38
- return self.valid
39
-
40
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
41
- if not self.valid:
42
- return ""
43
- # Find intersecting cells on the page
44
- text_piece = ""
45
- page_size = self.get_size()
46
-
47
- parser_width = self._dpage["sanitized"]["dimension"]["width"]
48
- parser_height = self._dpage["sanitized"]["dimension"]["height"]
49
-
50
- scale = (
51
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
52
- )
53
-
54
- cells_data = self._dpage["sanitized"]["cells"]["data"]
55
- cells_header = self._dpage["sanitized"]["cells"]["header"]
56
-
57
- for i, cell_data in enumerate(cells_data):
58
- x0 = cell_data[cells_header.index("x0")]
59
- y0 = cell_data[cells_header.index("y0")]
60
- x1 = cell_data[cells_header.index("x1")]
61
- y1 = cell_data[cells_header.index("y1")]
62
-
63
- cell_bbox = BoundingBox(
64
- l=x0 * scale * page_size.width / parser_width,
65
- b=y0 * scale * page_size.height / parser_height,
66
- r=x1 * scale * page_size.width / parser_width,
67
- t=y1 * scale * page_size.height / parser_height,
68
- coord_origin=CoordOrigin.BOTTOMLEFT,
69
- ).to_top_left_origin(page_height=page_size.height * scale)
70
-
71
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
72
-
73
- if overlap_frac > 0.5:
74
- if len(text_piece) > 0:
75
- text_piece += " "
76
- text_piece += cell_data[cells_header.index("text")]
77
-
78
- return text_piece
79
-
80
- def get_text_cells(self) -> Iterable[Cell]:
81
- cells: List[Cell] = []
82
- cell_counter = 0
83
-
84
- if not self.valid:
85
- return cells
86
-
87
- page_size = self.get_size()
88
-
89
- parser_width = self._dpage["sanitized"]["dimension"]["width"]
90
- parser_height = self._dpage["sanitized"]["dimension"]["height"]
91
-
92
- cells_data = self._dpage["sanitized"]["cells"]["data"]
93
- cells_header = self._dpage["sanitized"]["cells"]["header"]
94
-
95
- for i, cell_data in enumerate(cells_data):
96
- x0 = cell_data[cells_header.index("x0")]
97
- y0 = cell_data[cells_header.index("y0")]
98
- x1 = cell_data[cells_header.index("x1")]
99
- y1 = cell_data[cells_header.index("y1")]
100
-
101
- if x1 < x0:
102
- x0, x1 = x1, x0
103
- if y1 < y0:
104
- y0, y1 = y1, y0
105
-
106
- text_piece = cell_data[cells_header.index("text")]
107
- cells.append(
108
- Cell(
109
- id=cell_counter,
110
- text=text_piece,
111
- bbox=BoundingBox(
112
- # l=x0, b=y0, r=x1, t=y1,
113
- l=x0 * page_size.width / parser_width,
114
- b=y0 * page_size.height / parser_height,
115
- r=x1 * page_size.width / parser_width,
116
- t=y1 * page_size.height / parser_height,
117
- coord_origin=CoordOrigin.BOTTOMLEFT,
118
- ).to_top_left_origin(page_size.height),
119
- )
120
- )
121
- cell_counter += 1
122
-
123
- def draw_clusters_and_cells():
124
- image = (
125
- self.get_page_image()
126
- ) # make new image to avoid drawing on the saved ones
127
- draw = ImageDraw.Draw(image)
128
- for c in cells:
129
- x0, y0, x1, y1 = c.bbox.as_tuple()
130
- cell_color = (
131
- random.randint(30, 140),
132
- random.randint(30, 140),
133
- random.randint(30, 140),
134
- )
135
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
136
- image.show()
137
-
138
- # draw_clusters_and_cells()
139
-
140
- return cells
141
-
142
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
143
- AREA_THRESHOLD = 0 # 32 * 32
144
-
145
- images = self._dpage["sanitized"]["images"]["data"]
146
- images_header = self._dpage["sanitized"]["images"]["header"]
147
-
148
- for row in images:
149
- x0 = row[images_header.index("x0")]
150
- y0 = row[images_header.index("y0")]
151
- x1 = row[images_header.index("x1")]
152
- y1 = row[images_header.index("y1")]
153
-
154
- cropbox = BoundingBox.from_tuple(
155
- (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
156
- ).to_top_left_origin(self.get_size().height)
157
-
158
- if cropbox.area() > AREA_THRESHOLD:
159
- cropbox = cropbox.scaled(scale=scale)
160
-
161
- yield cropbox
162
-
163
- def get_page_image(
164
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
165
- ) -> Image.Image:
166
-
167
- page_size = self.get_size()
168
-
169
- if not cropbox:
170
- cropbox = BoundingBox(
171
- l=0,
172
- r=page_size.width,
173
- t=0,
174
- b=page_size.height,
175
- coord_origin=CoordOrigin.TOPLEFT,
176
- )
177
- padbox = BoundingBox(
178
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
179
- )
180
- else:
181
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
182
- padbox.r = page_size.width - padbox.r
183
- padbox.t = page_size.height - padbox.t
184
-
185
- image = (
186
- self._ppage.render(
187
- scale=scale * 1.5,
188
- rotation=0, # no additional rotation
189
- crop=padbox.as_tuple(),
190
- )
191
- .to_pil()
192
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
193
- ) # We resize the image from 1.5x the given scale to make it sharper.
194
-
195
- return image
196
-
197
- def get_size(self) -> Size:
198
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
199
-
200
- def unload(self):
201
- self._ppage = None
202
- self._dpage = None
203
-
204
-
205
- class DoclingParseV2DocumentBackend(PdfDocumentBackend):
206
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
- super().__init__(in_doc, path_or_stream)
208
-
209
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
210
- self.parser = pdf_parser_v2("fatal")
211
-
212
- success = False
213
- if isinstance(self.path_or_stream, BytesIO):
214
- success = self.parser.load_document_from_bytesio(
215
- self.document_hash, self.path_or_stream
216
- )
217
- elif isinstance(self.path_or_stream, Path):
218
- success = self.parser.load_document(
219
- self.document_hash, str(self.path_or_stream)
220
- )
221
-
222
- if not success:
223
- raise RuntimeError(
224
- f"docling-parse v2 could not load document {self.document_hash}."
225
- )
226
-
227
- def page_count(self) -> int:
228
- # return len(self._pdoc) # To be replaced with docling-parse API
229
-
230
- len_1 = len(self._pdoc)
231
- len_2 = self.parser.number_of_pages(self.document_hash)
232
-
233
- if len_1 != len_2:
234
- _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
235
-
236
- return len_2
237
-
238
- def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
239
- return DoclingParseV2PageBackend(
240
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
241
- )
242
-
243
- def is_valid(self) -> bool:
244
- return self.page_count() > 0
245
-
246
- def unload(self):
247
- super().unload()
248
- self.parser.unload_document(self.document_hash)
249
- self._pdoc.close()
250
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py DELETED
@@ -1,442 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Optional, Set, Union
5
-
6
- from bs4 import BeautifulSoup, Tag
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupLabel,
12
- TableCell,
13
- TableData,
14
- )
15
-
16
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
17
- from docling.datamodel.base_models import InputFormat
18
- from docling.datamodel.document import InputDocument
19
-
20
- _log = logging.getLogger(__name__)
21
-
22
-
23
- class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
- super().__init__(in_doc, path_or_stream)
26
- _log.debug("About to init HTML backend...")
27
- self.soup: Optional[Tag] = None
28
- # HTML file:
29
- self.path_or_stream = path_or_stream
30
- # Initialise the parents for the hierarchy
31
- self.max_levels = 10
32
- self.level = 0
33
- self.parents = {} # type: ignore
34
- for i in range(0, self.max_levels):
35
- self.parents[i] = None
36
- self.labels = {} # type: ignore
37
-
38
- try:
39
- if isinstance(self.path_or_stream, BytesIO):
40
- text_stream = self.path_or_stream.getvalue()
41
- self.soup = BeautifulSoup(text_stream, "html.parser")
42
- if isinstance(self.path_or_stream, Path):
43
- with open(self.path_or_stream, "rb") as f:
44
- html_content = f.read()
45
- self.soup = BeautifulSoup(html_content, "html.parser")
46
- except Exception as e:
47
- raise RuntimeError(
48
- f"Could not initialize HTML backend for file with hash {self.document_hash}."
49
- ) from e
50
-
51
- def is_valid(self) -> bool:
52
- return self.soup is not None
53
-
54
- @classmethod
55
- def supports_pagination(cls) -> bool:
56
- return False
57
-
58
- def unload(self):
59
- if isinstance(self.path_or_stream, BytesIO):
60
- self.path_or_stream.close()
61
-
62
- self.path_or_stream = None
63
-
64
- @classmethod
65
- def supported_formats(cls) -> Set[InputFormat]:
66
- return {InputFormat.HTML}
67
-
68
- def convert(self) -> DoclingDocument:
69
- # access self.path_or_stream to load stuff
70
- origin = DocumentOrigin(
71
- filename=self.file.name or "file",
72
- mimetype="text/html",
73
- binary_hash=self.document_hash,
74
- )
75
-
76
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
77
- _log.debug("Trying to convert HTML...")
78
-
79
- if self.is_valid():
80
- assert self.soup is not None
81
- content = self.soup.body or self.soup
82
- # Replace <br> tags with newline characters
83
- for br in content.find_all("br"):
84
- br.replace_with("\n")
85
- doc = self.walk(content, doc)
86
- else:
87
- raise RuntimeError(
88
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
89
- )
90
- return doc
91
-
92
- def walk(self, element: Tag, doc: DoclingDocument):
93
- try:
94
- # Iterate over elements in the body of the document
95
- for idx, element in enumerate(element.children):
96
- try:
97
- self.analyse_element(element, idx, doc)
98
- except Exception as exc_child:
99
-
100
- _log.error(" -> error treating child: ", exc_child)
101
- _log.error(" => element: ", element, "\n")
102
- raise exc_child
103
-
104
- except Exception as exc:
105
- pass
106
-
107
- return doc
108
-
109
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
- """
111
- if element.name!=None:
112
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
113
- """
114
-
115
- if element.name in self.labels:
116
- self.labels[element.name] += 1
117
- else:
118
- self.labels[element.name] = 1
119
-
120
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
121
- self.handle_header(element, idx, doc)
122
- elif element.name in ["p"]:
123
- self.handle_paragraph(element, idx, doc)
124
- elif element.name in ["pre"]:
125
- self.handle_code(element, idx, doc)
126
- elif element.name in ["ul", "ol"]:
127
- self.handle_list(element, idx, doc)
128
- elif element.name in ["li"]:
129
- self.handle_listitem(element, idx, doc)
130
- elif element.name == "table":
131
- self.handle_table(element, idx, doc)
132
- elif element.name == "figure":
133
- self.handle_figure(element, idx, doc)
134
- elif element.name == "img":
135
- self.handle_image(element, idx, doc)
136
- else:
137
- self.walk(element, doc)
138
-
139
- def get_direct_text(self, item: Tag):
140
- """Get the direct text of the <li> element (ignoring nested lists)."""
141
- text = item.find(string=True, recursive=False)
142
- if isinstance(text, str):
143
- return text.strip()
144
-
145
- return ""
146
-
147
- # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item: Tag):
149
- result = []
150
-
151
- if isinstance(item, str):
152
- return [item]
153
-
154
- if item.name not in ["ul", "ol"]:
155
- try:
156
- # Iterate over the children (and their text and tails)
157
- for child in item:
158
- try:
159
- # Recursively get the child's text content
160
- result.extend(self.extract_text_recursively(child))
161
- except:
162
- pass
163
- except:
164
- _log.warn("item has no children")
165
- pass
166
-
167
- return "".join(result) + " "
168
-
169
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
170
- """Handles header tags (h1, h2, etc.)."""
171
- hlevel = int(element.name.replace("h", ""))
172
- slevel = hlevel - 1
173
-
174
- label = DocItemLabel.SECTION_HEADER
175
- text = element.text.strip()
176
-
177
- if hlevel == 1:
178
- for key, val in self.parents.items():
179
- self.parents[key] = None
180
-
181
- self.level = 1
182
- self.parents[self.level] = doc.add_text(
183
- parent=self.parents[0], label=DocItemLabel.TITLE, text=text
184
- )
185
- else:
186
- if hlevel > self.level:
187
-
188
- # add invisible group
189
- for i in range(self.level + 1, hlevel):
190
- self.parents[i] = doc.add_group(
191
- name=f"header-{i}",
192
- label=GroupLabel.SECTION,
193
- parent=self.parents[i - 1],
194
- )
195
- self.level = hlevel
196
-
197
- elif hlevel < self.level:
198
-
199
- # remove the tail
200
- for key, val in self.parents.items():
201
- if key > hlevel:
202
- self.parents[key] = None
203
- self.level = hlevel
204
-
205
- self.parents[hlevel] = doc.add_heading(
206
- parent=self.parents[hlevel - 1],
207
- text=text,
208
- level=hlevel,
209
- )
210
-
211
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
212
- """Handles monospace code snippets (pre)."""
213
- if element.text is None:
214
- return
215
- text = element.text.strip()
216
- label = DocItemLabel.CODE
217
- if len(text) == 0:
218
- return
219
- doc.add_code(parent=self.parents[self.level], text=text)
220
-
221
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
222
- """Handles paragraph tags (p)."""
223
- if element.text is None:
224
- return
225
- text = element.text.strip()
226
- label = DocItemLabel.PARAGRAPH
227
- if len(text) == 0:
228
- return
229
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
-
231
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
232
- """Handles list tags (ul, ol) and their list items."""
233
-
234
- if element.name == "ul":
235
- # create a list group
236
- self.parents[self.level + 1] = doc.add_group(
237
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
238
- )
239
- elif element.name == "ol":
240
- # create a list group
241
- self.parents[self.level + 1] = doc.add_group(
242
- parent=self.parents[self.level],
243
- name="ordered list",
244
- label=GroupLabel.ORDERED_LIST,
245
- )
246
- self.level += 1
247
-
248
- self.walk(element, doc)
249
-
250
- self.parents[self.level + 1] = None
251
- self.level -= 1
252
-
253
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
254
- """Handles listitem tags (li)."""
255
- nested_lists = element.find(["ul", "ol"])
256
-
257
- parent_list_label = self.parents[self.level].label
258
- index_in_list = len(self.parents[self.level].children) + 1
259
-
260
- if nested_lists:
261
- name = element.name
262
- # Text in list item can be hidden within hierarchy, hence
263
- # we need to extract it recursively
264
- text = self.extract_text_recursively(element)
265
- # Flatten text, remove break lines:
266
- text = text.replace("\n", "").replace("\r", "")
267
- text = " ".join(text.split()).strip()
268
-
269
- marker = ""
270
- enumerated = False
271
- if parent_list_label == GroupLabel.ORDERED_LIST:
272
- marker = str(index_in_list)
273
- enumerated = True
274
-
275
- if len(text) > 0:
276
- # create a list-item
277
- self.parents[self.level + 1] = doc.add_list_item(
278
- text=text,
279
- enumerated=enumerated,
280
- marker=marker,
281
- parent=self.parents[self.level],
282
- )
283
- self.level += 1
284
-
285
- self.walk(element, doc)
286
-
287
- self.parents[self.level + 1] = None
288
- self.level -= 1
289
-
290
- elif isinstance(element.text, str):
291
- text = element.text.strip()
292
-
293
- marker = ""
294
- enumerated = False
295
- if parent_list_label == GroupLabel.ORDERED_LIST:
296
- marker = f"{str(index_in_list)}."
297
- enumerated = True
298
- doc.add_list_item(
299
- text=text,
300
- enumerated=enumerated,
301
- marker=marker,
302
- parent=self.parents[self.level],
303
- )
304
- else:
305
- _log.warn("list-item has no text: ", element)
306
-
307
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
- """Handles table tags."""
309
-
310
- nested_tables = element.find("table")
311
- if nested_tables is not None:
312
- _log.warn("detected nested tables: skipping for now")
313
- return
314
-
315
- # Count the number of rows (number of <tr> elements)
316
- num_rows = len(element.find_all("tr"))
317
-
318
- # Find the number of columns (taking into account colspan)
319
- num_cols = 0
320
- for row in element.find_all("tr"):
321
- col_count = 0
322
- for cell in row.find_all(["td", "th"]):
323
- colspan = int(cell.get("colspan", 1))
324
- col_count += colspan
325
- num_cols = max(num_cols, col_count)
326
-
327
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
328
-
329
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
330
-
331
- # Iterate over the rows in the table
332
- for row_idx, row in enumerate(element.find_all("tr")):
333
-
334
- # For each row, find all the column cells (both <td> and <th>)
335
- cells = row.find_all(["td", "th"])
336
-
337
- # Check if each cell in the row is a header -> means it is a column header
338
- col_header = True
339
- for j, html_cell in enumerate(cells):
340
- if html_cell.name == "td":
341
- col_header = False
342
-
343
- col_idx = 0
344
- # Extract and print the text content of each cell
345
- for _, html_cell in enumerate(cells):
346
-
347
- text = html_cell.text
348
- try:
349
- text = self.extract_table_cell_text(html_cell)
350
- except Exception as exc:
351
- _log.warn("exception: ", exc)
352
- exit(-1)
353
-
354
- # label = html_cell.name
355
-
356
- col_span = int(html_cell.get("colspan", 1))
357
- row_span = int(html_cell.get("rowspan", 1))
358
-
359
- while grid[row_idx][col_idx] is not None:
360
- col_idx += 1
361
- for r in range(row_span):
362
- for c in range(col_span):
363
- grid[row_idx + r][col_idx + c] = text
364
-
365
- cell = TableCell(
366
- text=text,
367
- row_span=row_span,
368
- col_span=col_span,
369
- start_row_offset_idx=row_idx,
370
- end_row_offset_idx=row_idx + row_span,
371
- start_col_offset_idx=col_idx,
372
- end_col_offset_idx=col_idx + col_span,
373
- col_header=col_header,
374
- row_header=((not col_header) and html_cell.name == "th"),
375
- )
376
- data.table_cells.append(cell)
377
-
378
- doc.add_table(data=data, parent=self.parents[self.level])
379
-
380
- def get_list_text(self, list_element: Tag, level=0):
381
- """Recursively extract text from <ul> or <ol> with proper indentation."""
382
- result = []
383
- bullet_char = "*" # Default bullet character for unordered lists
384
-
385
- if list_element.name == "ol": # For ordered lists, use numbers
386
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
387
- # Add numbering for ordered lists
388
- result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
389
- # Handle nested lists
390
- nested_list = li.find(["ul", "ol"])
391
- if nested_list:
392
- result.extend(self.get_list_text(nested_list, level + 1))
393
- elif list_element.name == "ul": # For unordered lists, use bullet points
394
- for li in list_element.find_all("li", recursive=False):
395
- # Add bullet points for unordered lists
396
- result.append(
397
- f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
398
- )
399
- # Handle nested lists
400
- nested_list = li.find(["ul", "ol"])
401
- if nested_list:
402
- result.extend(self.get_list_text(nested_list, level + 1))
403
-
404
- return result
405
-
406
- def extract_table_cell_text(self, cell: Tag):
407
- """Extract text from a table cell, including lists with indents."""
408
- contains_lists = cell.find(["ul", "ol"])
409
- if contains_lists is None:
410
- return cell.text
411
- else:
412
- _log.debug(
413
- "should extract the content correctly for table-cells with lists ..."
414
- )
415
- return cell.text
416
-
417
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
418
- """Handles image tags (img)."""
419
-
420
- # Extract the image URI from the <img> tag
421
- # image_uri = root.xpath('//figure//img/@src')[0]
422
-
423
- contains_captions = element.find(["figcaption"])
424
- if contains_captions is None:
425
- doc.add_picture(parent=self.parents[self.level], caption=None)
426
-
427
- else:
428
- texts = []
429
- for item in contains_captions:
430
- texts.append(item.text)
431
-
432
- fig_caption = doc.add_text(
433
- label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
434
- )
435
- doc.add_picture(
436
- parent=self.parents[self.level],
437
- caption=fig_caption,
438
- )
439
-
440
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
441
- """Handles image tags (img)."""
442
- doc.add_picture(parent=self.parents[self.level], caption=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py DELETED
@@ -1,58 +0,0 @@
1
- from io import BytesIO
2
- from pathlib import Path
3
- from typing import Union
4
-
5
- from docling_core.types.doc import DoclingDocument
6
- from typing_extensions import override
7
-
8
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import InputDocument
11
-
12
-
13
- class DoclingJSONBackend(DeclarativeDocumentBackend):
14
- @override
15
- def __init__(
16
- self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
17
- ) -> None:
18
- super().__init__(in_doc, path_or_stream)
19
-
20
- # given we need to store any actual conversion exception for raising it from
21
- # convert(), this captures the successful result or the actual error in a
22
- # mutually exclusive way:
23
- self._doc_or_err = self._get_doc_or_err()
24
-
25
- @override
26
- def is_valid(self) -> bool:
27
- return isinstance(self._doc_or_err, DoclingDocument)
28
-
29
- @classmethod
30
- @override
31
- def supports_pagination(cls) -> bool:
32
- return False
33
-
34
- @classmethod
35
- @override
36
- def supported_formats(cls) -> set[InputFormat]:
37
- return {InputFormat.JSON_DOCLING}
38
-
39
- def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
40
- try:
41
- json_data: Union[str, bytes]
42
- if isinstance(self.path_or_stream, Path):
43
- with open(self.path_or_stream, encoding="utf-8") as f:
44
- json_data = f.read()
45
- elif isinstance(self.path_or_stream, BytesIO):
46
- json_data = self.path_or_stream.getvalue()
47
- else:
48
- raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
49
- return DoclingDocument.model_validate_json(json_data=json_data)
50
- except Exception as e:
51
- return e
52
-
53
- @override
54
- def convert(self) -> DoclingDocument:
55
- if isinstance(self._doc_or_err, DoclingDocument):
56
- return self._doc_or_err
57
- else:
58
- raise self._doc_or_err
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py DELETED
@@ -1,428 +0,0 @@
1
- import logging
2
- import re
3
- import warnings
4
- from io import BytesIO
5
- from pathlib import Path
6
- from typing import List, Optional, Set, Union
7
-
8
- import marko
9
- import marko.element
10
- import marko.ext
11
- import marko.ext.gfm
12
- import marko.inline
13
- from docling_core.types.doc import (
14
- DocItem,
15
- DocItemLabel,
16
- DoclingDocument,
17
- DocumentOrigin,
18
- GroupLabel,
19
- NodeItem,
20
- TableCell,
21
- TableData,
22
- TextItem,
23
- )
24
- from marko import Markdown
25
-
26
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
- from docling.backend.html_backend import HTMLDocumentBackend
28
- from docling.datamodel.base_models import InputFormat
29
- from docling.datamodel.document import InputDocument
30
-
31
- _log = logging.getLogger(__name__)
32
-
33
- _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
- _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
- _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
-
37
-
38
- class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
- def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
- # This regex will match any sequence of underscores
41
- pattern = r"_+"
42
-
43
- def replace_match(match):
44
- underscore_sequence = match.group(
45
- 0
46
- ) # Get the full match (sequence of underscores)
47
-
48
- # Shorten the sequence if it exceeds max_length
49
- if len(underscore_sequence) > max_length:
50
- return "_" * max_length
51
- else:
52
- return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
53
-
54
- # Use re.sub to replace long underscore sequences
55
- shortened_text = re.sub(pattern, replace_match, markdown_text)
56
-
57
- if len(shortened_text) != len(markdown_text):
58
- warnings.warn("Detected potentially incorrect Markdown, correcting...")
59
-
60
- return shortened_text
61
-
62
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
63
- super().__init__(in_doc, path_or_stream)
64
-
65
- _log.debug("MD INIT!!!")
66
-
67
- # Markdown file:
68
- self.path_or_stream = path_or_stream
69
- self.valid = True
70
- self.markdown = "" # To store original Markdown string
71
-
72
- self.in_table = False
73
- self.md_table_buffer: list[str] = []
74
- self.inline_texts: list[str] = []
75
- self._html_blocks: int = 0
76
-
77
- try:
78
- if isinstance(self.path_or_stream, BytesIO):
79
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
80
- # remove invalid sequences
81
- # very long sequences of underscores will lead to unnecessary long processing times.
82
- # In any proper Markdown files, underscores have to be escaped,
83
- # otherwise they represent emphasis (bold or italic)
84
- self.markdown = self._shorten_underscore_sequences(text_stream)
85
- if isinstance(self.path_or_stream, Path):
86
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
87
- md_content = f.read()
88
- # remove invalid sequences
89
- # very long sequences of underscores will lead to unnecessary long processing times.
90
- # In any proper Markdown files, underscores have to be escaped,
91
- # otherwise they represent emphasis (bold or italic)
92
- self.markdown = self._shorten_underscore_sequences(md_content)
93
- self.valid = True
94
-
95
- _log.debug(self.markdown)
96
- except Exception as e:
97
- raise RuntimeError(
98
- f"Could not initialize MD backend for file with hash {self.document_hash}."
99
- ) from e
100
- return
101
-
102
- def _close_table(self, doc: DoclingDocument):
103
- if self.in_table:
104
- _log.debug("=== TABLE START ===")
105
- for md_table_row in self.md_table_buffer:
106
- _log.debug(md_table_row)
107
- _log.debug("=== TABLE END ===")
108
- tcells: List[TableCell] = []
109
- result_table = []
110
- for n, md_table_row in enumerate(self.md_table_buffer):
111
- data = []
112
- if n == 0:
113
- header = [t.strip() for t in md_table_row.split("|")[1:-1]]
114
- for value in header:
115
- data.append(value)
116
- result_table.append(data)
117
- if n > 1:
118
- values = [t.strip() for t in md_table_row.split("|")[1:-1]]
119
- for value in values:
120
- data.append(value)
121
- result_table.append(data)
122
-
123
- for trow_ind, trow in enumerate(result_table):
124
- for tcol_ind, cellval in enumerate(trow):
125
- row_span = (
126
- 1 # currently supporting just simple tables (without spans)
127
- )
128
- col_span = (
129
- 1 # currently supporting just simple tables (without spans)
130
- )
131
- icell = TableCell(
132
- text=cellval.strip(),
133
- row_span=row_span,
134
- col_span=col_span,
135
- start_row_offset_idx=trow_ind,
136
- end_row_offset_idx=trow_ind + row_span,
137
- start_col_offset_idx=tcol_ind,
138
- end_col_offset_idx=tcol_ind + col_span,
139
- col_header=False,
140
- row_header=False,
141
- )
142
- tcells.append(icell)
143
-
144
- num_rows = len(result_table)
145
- num_cols = len(result_table[0])
146
- self.in_table = False
147
- self.md_table_buffer = [] # clean table markdown buffer
148
- # Initialize Docling TableData
149
- table_data = TableData(
150
- num_rows=num_rows, num_cols=num_cols, table_cells=tcells
151
- )
152
- # Populate
153
- for tcell in tcells:
154
- table_data.table_cells.append(tcell)
155
- if len(tcells) > 0:
156
- doc.add_table(data=table_data)
157
- return
158
-
159
- def _process_inline_text(
160
- self, parent_item: Optional[NodeItem], doc: DoclingDocument
161
- ):
162
- txt = " ".join(self.inline_texts)
163
- if len(txt) > 0:
164
- doc.add_text(
165
- label=DocItemLabel.PARAGRAPH,
166
- parent=parent_item,
167
- text=txt,
168
- )
169
- self.inline_texts = []
170
-
171
- def _iterate_elements(
172
- self,
173
- element: marko.element.Element,
174
- depth: int,
175
- doc: DoclingDocument,
176
- visited: Set[marko.element.Element],
177
- parent_item: Optional[NodeItem] = None,
178
- ):
179
-
180
- if element in visited:
181
- return
182
-
183
- # Iterates over all elements in the AST
184
- # Check for different element types and process relevant details
185
- if isinstance(element, marko.block.Heading) and len(element.children) > 0:
186
- self._close_table(doc)
187
- self._process_inline_text(parent_item, doc)
188
- _log.debug(
189
- f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
190
- )
191
- if element.level == 1:
192
- doc_label = DocItemLabel.TITLE
193
- else:
194
- doc_label = DocItemLabel.SECTION_HEADER
195
-
196
- # Header could have arbitrary inclusion of bold, italic or emphasis,
197
- # hence we need to traverse the tree to get full text of a header
198
- strings: List[str] = []
199
-
200
- # Define a recursive function to traverse the tree
201
- def traverse(node: marko.block.BlockElement):
202
- # Check if the node has a "children" attribute
203
- if hasattr(node, "children"):
204
- # If "children" is a list, continue traversal
205
- if isinstance(node.children, list):
206
- for child in node.children:
207
- traverse(child)
208
- # If "children" is text, add it to header text
209
- elif isinstance(node.children, str):
210
- strings.append(node.children)
211
-
212
- traverse(element)
213
- snippet_text = "".join(strings)
214
- if len(snippet_text) > 0:
215
- parent_item = doc.add_text(
216
- label=doc_label, parent=parent_item, text=snippet_text
217
- )
218
-
219
- elif isinstance(element, marko.block.List):
220
- has_non_empty_list_items = False
221
- for child in element.children:
222
- if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
223
- has_non_empty_list_items = True
224
- break
225
-
226
- self._close_table(doc)
227
- self._process_inline_text(parent_item, doc)
228
- _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
229
- if has_non_empty_list_items:
230
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
231
- parent_item = doc.add_group(
232
- label=label, name=f"list", parent=parent_item
233
- )
234
-
235
- elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
236
- self._close_table(doc)
237
- self._process_inline_text(parent_item, doc)
238
- _log.debug(" - List item")
239
-
240
- first_child = element.children[0]
241
- snippet_text = str(first_child.children[0].children) # type: ignore
242
- is_numbered = False
243
- if (
244
- parent_item is not None
245
- and isinstance(parent_item, DocItem)
246
- and parent_item.label == GroupLabel.ORDERED_LIST
247
- ):
248
- is_numbered = True
249
- doc.add_list_item(
250
- enumerated=is_numbered, parent=parent_item, text=snippet_text
251
- )
252
- visited.add(first_child)
253
-
254
- elif isinstance(element, marko.inline.Image):
255
- self._close_table(doc)
256
- self._process_inline_text(parent_item, doc)
257
- _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
258
-
259
- fig_caption: Optional[TextItem] = None
260
- if element.title is not None and element.title != "":
261
- fig_caption = doc.add_text(
262
- label=DocItemLabel.CAPTION, text=element.title
263
- )
264
-
265
- doc.add_picture(parent=parent_item, caption=fig_caption)
266
-
267
- elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
268
- self._process_inline_text(parent_item, doc)
269
-
270
- elif isinstance(element, marko.inline.RawText):
271
- _log.debug(f" - Paragraph (raw text): {element.children}")
272
- snippet_text = element.children.strip()
273
- # Detect start of the table:
274
- if "|" in snippet_text:
275
- # most likely part of the markdown table
276
- self.in_table = True
277
- if len(self.md_table_buffer) > 0:
278
- self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
279
- else:
280
- self.md_table_buffer.append(snippet_text)
281
- else:
282
- self._close_table(doc)
283
- # most likely just inline text
284
- self.inline_texts.append(str(element.children))
285
-
286
- elif isinstance(element, marko.inline.CodeSpan):
287
- self._close_table(doc)
288
- self._process_inline_text(parent_item, doc)
289
- _log.debug(f" - Code Span: {element.children}")
290
- snippet_text = str(element.children).strip()
291
- doc.add_code(parent=parent_item, text=snippet_text)
292
-
293
- elif (
294
- isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
295
- and len(element.children) > 0
296
- and isinstance((first_child := element.children[0]), marko.inline.RawText)
297
- and len(snippet_text := (first_child.children.strip())) > 0
298
- ):
299
- self._close_table(doc)
300
- self._process_inline_text(parent_item, doc)
301
- _log.debug(f" - Code Block: {element.children}")
302
- doc.add_code(parent=parent_item, text=snippet_text)
303
-
304
- elif isinstance(element, marko.inline.LineBreak):
305
- if self.in_table:
306
- _log.debug("Line break in a table")
307
- self.md_table_buffer.append("")
308
-
309
- elif isinstance(element, marko.block.HTMLBlock):
310
- self._html_blocks += 1
311
- self._process_inline_text(parent_item, doc)
312
- self._close_table(doc)
313
- _log.debug("HTML Block: {}".format(element))
314
- if (
315
- len(element.body) > 0
316
- ): # If Marko doesn't return any content for HTML block, skip it
317
- html_block = element.body.strip()
318
-
319
- # wrap in markers to enable post-processing in convert()
320
- text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
321
- doc.add_code(parent=parent_item, text=text_to_add)
322
- else:
323
- if not isinstance(element, str):
324
- self._close_table(doc)
325
- _log.debug("Some other element: {}".format(element))
326
-
327
- processed_block_types = (
328
- marko.block.Heading,
329
- marko.block.CodeBlock,
330
- marko.block.FencedCode,
331
- marko.inline.RawText,
332
- )
333
-
334
- # Iterate through the element's children (if any)
335
- if hasattr(element, "children") and not isinstance(
336
- element, processed_block_types
337
- ):
338
- for child in element.children:
339
- self._iterate_elements(
340
- element=child,
341
- depth=depth + 1,
342
- doc=doc,
343
- visited=visited,
344
- parent_item=parent_item,
345
- )
346
-
347
- def is_valid(self) -> bool:
348
- return self.valid
349
-
350
- def unload(self):
351
- if isinstance(self.path_or_stream, BytesIO):
352
- self.path_or_stream.close()
353
- self.path_or_stream = None
354
-
355
- @classmethod
356
- def supports_pagination(cls) -> bool:
357
- return False
358
-
359
- @classmethod
360
- def supported_formats(cls) -> Set[InputFormat]:
361
- return {InputFormat.MD}
362
-
363
- def convert(self) -> DoclingDocument:
364
- _log.debug("converting Markdown...")
365
-
366
- origin = DocumentOrigin(
367
- filename=self.file.name or "file",
368
- mimetype="text/markdown",
369
- binary_hash=self.document_hash,
370
- )
371
-
372
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
373
-
374
- if self.is_valid():
375
- # Parse the markdown into an abstract syntax tree (AST)
376
- marko_parser = Markdown()
377
- parsed_ast = marko_parser.parse(self.markdown)
378
- # Start iterating from the root of the AST
379
- self._iterate_elements(
380
- element=parsed_ast,
381
- depth=0,
382
- doc=doc,
383
- parent_item=None,
384
- visited=set(),
385
- )
386
- self._process_inline_text(None, doc) # handle last hanging inline text
387
- self._close_table(doc=doc) # handle any last hanging table
388
-
389
- # if HTML blocks were detected, export to HTML and delegate to HTML backend
390
- if self._html_blocks > 0:
391
-
392
- # export to HTML
393
- html_backend_cls = HTMLDocumentBackend
394
- html_str = doc.export_to_html()
395
-
396
- def _restore_original_html(txt, regex):
397
- _txt, count = re.subn(regex, "", txt)
398
- if count != self._html_blocks:
399
- raise RuntimeError(
400
- "An internal error has occurred during Markdown conversion."
401
- )
402
- return _txt
403
-
404
- # restore original HTML by removing previouly added markers
405
- for regex in [
406
- rf"<pre>\s*<code>\s*{_START_MARKER}",
407
- rf"{_STOP_MARKER}\s*</code>\s*</pre>",
408
- ]:
409
- html_str = _restore_original_html(txt=html_str, regex=regex)
410
- self._html_blocks = 0
411
-
412
- # delegate to HTML backend
413
- stream = BytesIO(bytes(html_str, encoding="utf-8"))
414
- in_doc = InputDocument(
415
- path_or_stream=stream,
416
- format=InputFormat.HTML,
417
- backend=html_backend_cls,
418
- filename=self.file.name,
419
- )
420
- html_backend_obj = html_backend_cls(
421
- in_doc=in_doc, path_or_stream=stream
422
- )
423
- doc = html_backend_obj.convert()
424
- else:
425
- raise RuntimeError(
426
- f"Cannot convert md with {self.document_hash} because the backend failed to init."
427
- )
428
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py DELETED
@@ -1,386 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Dict, Set, Tuple, Union
5
-
6
- from docling_core.types.doc import (
7
- DoclingDocument,
8
- DocumentOrigin,
9
- GroupLabel,
10
- ImageRef,
11
- TableCell,
12
- TableData,
13
- )
14
-
15
- # from lxml import etree
16
- from openpyxl import Workbook, load_workbook
17
- from openpyxl.cell.cell import Cell
18
- from openpyxl.drawing.image import Image
19
- from openpyxl.worksheet.worksheet import Worksheet
20
-
21
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
22
- from docling.datamodel.base_models import InputFormat
23
- from docling.datamodel.document import InputDocument
24
-
25
- _log = logging.getLogger(__name__)
26
-
27
- from typing import Any, List
28
-
29
- from PIL import Image as PILImage
30
- from pydantic import BaseModel
31
-
32
-
33
- class ExcelCell(BaseModel):
34
- row: int
35
- col: int
36
- text: str
37
- row_span: int
38
- col_span: int
39
-
40
-
41
- class ExcelTable(BaseModel):
42
- num_rows: int
43
- num_cols: int
44
- data: List[ExcelCell]
45
-
46
-
47
- class MsExcelDocumentBackend(DeclarativeDocumentBackend):
48
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
49
- super().__init__(in_doc, path_or_stream)
50
-
51
- # Initialise the parents for the hierarchy
52
- self.max_levels = 10
53
-
54
- self.parents: Dict[int, Any] = {}
55
- for i in range(-1, self.max_levels):
56
- self.parents[i] = None
57
-
58
- self.workbook = None
59
- try:
60
- if isinstance(self.path_or_stream, BytesIO):
61
- self.workbook = load_workbook(filename=self.path_or_stream)
62
-
63
- elif isinstance(self.path_or_stream, Path):
64
- self.workbook = load_workbook(filename=str(self.path_or_stream))
65
-
66
- self.valid = True
67
- except Exception as e:
68
- self.valid = False
69
-
70
- raise RuntimeError(
71
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
72
- ) from e
73
-
74
- def is_valid(self) -> bool:
75
- _log.info(f"valid: {self.valid}")
76
- return self.valid
77
-
78
- @classmethod
79
- def supports_pagination(cls) -> bool:
80
- return True
81
-
82
- def unload(self):
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.close()
85
-
86
- self.path_or_stream = None
87
-
88
- @classmethod
89
- def supported_formats(cls) -> Set[InputFormat]:
90
- return {InputFormat.XLSX}
91
-
92
- def convert(self) -> DoclingDocument:
93
- # Parses the XLSX into a structured document model.
94
-
95
- origin = DocumentOrigin(
96
- filename=self.file.name or "file.xlsx",
97
- mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
98
- binary_hash=self.document_hash,
99
- )
100
-
101
- doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
102
-
103
- if self.is_valid():
104
- doc = self._convert_workbook(doc)
105
- else:
106
- raise RuntimeError(
107
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
108
- )
109
-
110
- return doc
111
-
112
- def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
113
-
114
- if self.workbook is not None:
115
-
116
- # Iterate over all sheets
117
- for sheet_name in self.workbook.sheetnames:
118
- _log.info(f"Processing sheet: {sheet_name}")
119
-
120
- # Access the sheet by name
121
- sheet = self.workbook[sheet_name]
122
-
123
- self.parents[0] = doc.add_group(
124
- parent=None,
125
- label=GroupLabel.SECTION,
126
- name=f"sheet: {sheet_name}",
127
- )
128
-
129
- doc = self._convert_sheet(doc, sheet)
130
- else:
131
- _log.error("Workbook is not initialized.")
132
-
133
- return doc
134
-
135
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
136
-
137
- doc = self._find_tables_in_sheet(doc, sheet)
138
-
139
- doc = self._find_images_in_sheet(doc, sheet)
140
-
141
- return doc
142
-
143
- def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
144
-
145
- tables = self._find_data_tables(sheet)
146
-
147
- for excel_table in tables:
148
- num_rows = excel_table.num_rows
149
- num_cols = excel_table.num_cols
150
-
151
- table_data = TableData(
152
- num_rows=num_rows,
153
- num_cols=num_cols,
154
- table_cells=[],
155
- )
156
-
157
- for excel_cell in excel_table.data:
158
-
159
- cell = TableCell(
160
- text=excel_cell.text,
161
- row_span=excel_cell.row_span,
162
- col_span=excel_cell.col_span,
163
- start_row_offset_idx=excel_cell.row,
164
- end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
- start_col_offset_idx=excel_cell.col,
166
- end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- col_header=False,
168
- row_header=False,
169
- )
170
- table_data.table_cells.append(cell)
171
-
172
- doc.add_table(data=table_data, parent=self.parents[0])
173
-
174
- return doc
175
-
176
- def _find_data_tables(self, sheet: Worksheet):
177
- """
178
- Find all compact rectangular data tables in a sheet.
179
- """
180
- # _log.info("find_data_tables")
181
-
182
- tables = [] # List to store found tables
183
- visited: set[Tuple[int, int]] = set() # Track already visited cells
184
-
185
- # Iterate over all cells in the sheet
186
- for ri, row in enumerate(sheet.iter_rows(values_only=False)):
187
- for rj, cell in enumerate(row):
188
-
189
- # Skip empty or already visited cells
190
- if cell.value is None or (ri, rj) in visited:
191
- continue
192
-
193
- # If the cell starts a new table, find its bounds
194
- table_bounds, visited_cells = self._find_table_bounds(
195
- sheet, ri, rj, visited
196
- )
197
-
198
- visited.update(visited_cells) # Mark these cells as visited
199
- tables.append(table_bounds)
200
-
201
- return tables
202
-
203
- def _find_table_bounds(
204
- self,
205
- sheet: Worksheet,
206
- start_row: int,
207
- start_col: int,
208
- visited: set[Tuple[int, int]],
209
- ):
210
- """
211
- Determine the bounds of a compact rectangular table.
212
- Returns:
213
- - A dictionary with the bounds and data.
214
- - A set of visited cell coordinates.
215
- """
216
- _log.info("find_table_bounds")
217
-
218
- max_row = self._find_table_bottom(sheet, start_row, start_col)
219
- max_col = self._find_table_right(sheet, start_row, start_col)
220
-
221
- # Collect the data within the bounds
222
- data = []
223
- visited_cells = set()
224
- for ri in range(start_row, max_row + 1):
225
- for rj in range(start_col, max_col + 1):
226
-
227
- cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
228
-
229
- # Check if the cell belongs to a merged range
230
- row_span = 1
231
- col_span = 1
232
-
233
- # _log.info(sheet.merged_cells.ranges)
234
- for merged_range in sheet.merged_cells.ranges:
235
-
236
- if (
237
- merged_range.min_row <= ri + 1
238
- and ri + 1 <= merged_range.max_row
239
- and merged_range.min_col <= rj + 1
240
- and rj + 1 <= merged_range.max_col
241
- ):
242
-
243
- row_span = merged_range.max_row - merged_range.min_row + 1
244
- col_span = merged_range.max_col - merged_range.min_col + 1
245
- break
246
-
247
- if (ri, rj) not in visited_cells:
248
- data.append(
249
- ExcelCell(
250
- row=ri - start_row,
251
- col=rj - start_col,
252
- text=str(cell.value),
253
- row_span=row_span,
254
- col_span=col_span,
255
- )
256
- )
257
- # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
258
-
259
- # Mark all cells in the span as visited
260
- for span_row in range(ri, ri + row_span):
261
- for span_col in range(rj, rj + col_span):
262
- visited_cells.add((span_row, span_col))
263
-
264
- return (
265
- ExcelTable(
266
- num_rows=max_row + 1 - start_row,
267
- num_cols=max_col + 1 - start_col,
268
- data=data,
269
- ),
270
- visited_cells,
271
- )
272
-
273
- def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
274
- """Function to find the bottom boundary of the table"""
275
-
276
- max_row = start_row
277
-
278
- while max_row < sheet.max_row - 1:
279
- # Get the cell value or check if it is part of a merged cell
280
- cell = sheet.cell(row=max_row + 2, column=start_col + 1)
281
-
282
- # Check if the cell is part of a merged range
283
- merged_range = next(
284
- (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
285
- None,
286
- )
287
-
288
- if cell.value is None and not merged_range:
289
- break # Stop if the cell is empty and not merged
290
-
291
- # Expand max_row to include the merged range if applicable
292
- if merged_range:
293
- max_row = max(max_row, merged_range.max_row - 1)
294
- else:
295
- max_row += 1
296
-
297
- return max_row
298
-
299
- def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
300
- """Function to find the right boundary of the table"""
301
-
302
- max_col = start_col
303
-
304
- while max_col < sheet.max_column - 1:
305
- # Get the cell value or check if it is part of a merged cell
306
- cell = sheet.cell(row=start_row + 1, column=max_col + 2)
307
-
308
- # Check if the cell is part of a merged range
309
- merged_range = next(
310
- (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
311
- None,
312
- )
313
-
314
- if cell.value is None and not merged_range:
315
- break # Stop if the cell is empty and not merged
316
-
317
- # Expand max_col to include the merged range if applicable
318
- if merged_range:
319
- max_col = max(max_col, merged_range.max_col - 1)
320
- else:
321
- max_col += 1
322
-
323
- return max_col
324
-
325
- def _find_images_in_sheet(
326
- self, doc: DoclingDocument, sheet: Worksheet
327
- ) -> DoclingDocument:
328
-
329
- # Iterate over byte images in the sheet
330
- for idx, image in enumerate(sheet._images): # type: ignore
331
-
332
- try:
333
- pil_image = PILImage.open(image.ref)
334
-
335
- doc.add_picture(
336
- parent=self.parents[0],
337
- image=ImageRef.from_pil(image=pil_image, dpi=72),
338
- caption=None,
339
- )
340
- except:
341
- _log.error("could not extract the image from excel sheets")
342
-
343
- """
344
- for idx, chart in enumerate(sheet._charts): # type: ignore
345
- try:
346
- chart_path = f"chart_{idx + 1}.png"
347
- _log.info(
348
- f"Chart found, but dynamic rendering is required for: {chart_path}"
349
- )
350
-
351
- _log.info(f"Chart {idx + 1}:")
352
-
353
- # Chart type
354
- # _log.info(f"Type: {type(chart).__name__}")
355
- print(f"Type: {type(chart).__name__}")
356
-
357
- # Extract series data
358
- for series_idx, series in enumerate(chart.series):
359
- #_log.info(f"Series {series_idx + 1}:")
360
- print(f"Series {series_idx + 1} type: {type(series).__name__}")
361
- #print(f"x-values: {series.xVal}")
362
- #print(f"y-values: {series.yVal}")
363
-
364
- print(f"xval type: {type(series.xVal).__name__}")
365
-
366
- xvals = []
367
- for _ in series.xVal.numLit.pt:
368
- print(f"xval type: {type(_).__name__}")
369
- if hasattr(_, 'v'):
370
- xvals.append(_.v)
371
-
372
- print(f"x-values: {xvals}")
373
-
374
- yvals = []
375
- for _ in series.yVal:
376
- if hasattr(_, 'v'):
377
- yvals.append(_.v)
378
-
379
- print(f"y-values: {yvals}")
380
-
381
- except Exception as exc:
382
- print(exc)
383
- continue
384
- """
385
-
386
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py DELETED
@@ -1,424 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Set, Union
5
-
6
- from docling_core.types.doc import (
7
- BoundingBox,
8
- CoordOrigin,
9
- DocItemLabel,
10
- DoclingDocument,
11
- DocumentOrigin,
12
- GroupLabel,
13
- ImageRef,
14
- ProvenanceItem,
15
- Size,
16
- TableCell,
17
- TableData,
18
- )
19
- from PIL import Image, UnidentifiedImageError
20
- from pptx import Presentation
21
- from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
22
-
23
- from docling.backend.abstract_backend import (
24
- DeclarativeDocumentBackend,
25
- PaginatedDocumentBackend,
26
- )
27
- from docling.datamodel.base_models import InputFormat
28
- from docling.datamodel.document import InputDocument
29
-
30
- _log = logging.getLogger(__name__)
31
-
32
-
33
- class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
34
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
35
- super().__init__(in_doc, path_or_stream)
36
- self.namespaces = {
37
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
38
- "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
39
- "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
40
- }
41
- # Powerpoint file:
42
- self.path_or_stream = path_or_stream
43
-
44
- self.pptx_obj = None
45
- self.valid = False
46
- try:
47
- if isinstance(self.path_or_stream, BytesIO):
48
- self.pptx_obj = Presentation(self.path_or_stream)
49
- elif isinstance(self.path_or_stream, Path):
50
- self.pptx_obj = Presentation(str(self.path_or_stream))
51
-
52
- self.valid = True
53
- except Exception as e:
54
- raise RuntimeError(
55
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
56
- ) from e
57
-
58
- return
59
-
60
- def page_count(self) -> int:
61
- if self.is_valid():
62
- assert self.pptx_obj is not None
63
- return len(self.pptx_obj.slides)
64
- else:
65
- return 0
66
-
67
- def is_valid(self) -> bool:
68
- return self.valid
69
-
70
- @classmethod
71
- def supports_pagination(cls) -> bool:
72
- return True # True? if so, how to handle pages...
73
-
74
- def unload(self):
75
- if isinstance(self.path_or_stream, BytesIO):
76
- self.path_or_stream.close()
77
-
78
- self.path_or_stream = None
79
-
80
- @classmethod
81
- def supported_formats(cls) -> Set[InputFormat]:
82
- return {InputFormat.PPTX}
83
-
84
- def convert(self) -> DoclingDocument:
85
- # Parses the PPTX into a structured document model.
86
- # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
87
-
88
- origin = DocumentOrigin(
89
- filename=self.file.name or "file",
90
- mimetype="application/vnd.ms-powerpoint",
91
- binary_hash=self.document_hash,
92
- )
93
-
94
- doc = DoclingDocument(
95
- name=self.file.stem or "file", origin=origin
96
- ) # must add origin information
97
- doc = self.walk_linear(self.pptx_obj, doc)
98
-
99
- return doc
100
-
101
- def generate_prov(
102
- self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
103
- ):
104
- if shape.left:
105
- left = shape.left
106
- top = shape.top
107
- width = shape.width
108
- height = shape.height
109
- else:
110
- left = 0
111
- top = 0
112
- width = slide_size.width
113
- height = slide_size.height
114
- shape_bbox = [left, top, left + width, top + height]
115
- shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
116
- prov = ProvenanceItem(
117
- page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
118
- )
119
-
120
- return prov
121
-
122
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
123
- is_a_list = False
124
- is_list_group_created = False
125
- enum_list_item_value = 0
126
- new_list = None
127
- bullet_type = "None"
128
- list_text = ""
129
- list_label = GroupLabel.LIST
130
- doc_label = DocItemLabel.LIST_ITEM
131
- prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
132
-
133
- # Identify if shape contains lists
134
- for paragraph in shape.text_frame.paragraphs:
135
- # Check if paragraph is a bullet point using the `element` XML
136
- p = paragraph._element
137
- if (
138
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
139
- is not None
140
- ):
141
- bullet_type = "Bullet"
142
- is_a_list = True
143
- elif (
144
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
145
- is not None
146
- ):
147
- bullet_type = "Numbered"
148
- is_a_list = True
149
- else:
150
- is_a_list = False
151
-
152
- if paragraph.level > 0:
153
- # Most likely a sub-list
154
- is_a_list = True
155
-
156
- if is_a_list:
157
- # Determine if this is an unordered list or an ordered list.
158
- # Set GroupLabel.ORDERED_LIST when it fits.
159
- if bullet_type == "Numbered":
160
- list_label = GroupLabel.ORDERED_LIST
161
-
162
- if is_a_list:
163
- _log.debug("LIST DETECTED!")
164
- else:
165
- _log.debug("No List")
166
-
167
- # If there is a list inside of the shape, create a new docling list to assign list items to
168
- # if is_a_list:
169
- # new_list = doc.add_group(
170
- # label=list_label, name=f"list", parent=parent_slide
171
- # )
172
-
173
- # Iterate through paragraphs to build up text
174
- for paragraph in shape.text_frame.paragraphs:
175
- # p_text = paragraph.text.strip()
176
- p = paragraph._element
177
- enum_list_item_value += 1
178
- inline_paragraph_text = ""
179
- inline_list_item_text = ""
180
-
181
- for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
182
- if len(e.text.strip()) > 0:
183
- e_is_a_list_item = False
184
- is_numbered = False
185
- if (
186
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
187
- is not None
188
- ):
189
- bullet_type = "Bullet"
190
- e_is_a_list_item = True
191
- elif (
192
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
193
- is not None
194
- ):
195
- bullet_type = "Numbered"
196
- is_numbered = True
197
- e_is_a_list_item = True
198
- else:
199
- e_is_a_list_item = False
200
-
201
- if e_is_a_list_item:
202
- if len(inline_paragraph_text) > 0:
203
- # output accumulated inline text:
204
- doc.add_text(
205
- label=doc_label,
206
- parent=parent_slide,
207
- text=inline_paragraph_text,
208
- prov=prov,
209
- )
210
- # Set marker and enumerated arguments if this is an enumeration element.
211
- inline_list_item_text += e.text
212
- # print(e.text)
213
- else:
214
- # Assign proper label to the text, depending if it's a Title or Section Header
215
- # For other types of text, assign - PARAGRAPH
216
- doc_label = DocItemLabel.PARAGRAPH
217
- if shape.is_placeholder:
218
- placeholder_type = shape.placeholder_format.type
219
- if placeholder_type in [
220
- PP_PLACEHOLDER.CENTER_TITLE,
221
- PP_PLACEHOLDER.TITLE,
222
- ]:
223
- # It's a title
224
- doc_label = DocItemLabel.TITLE
225
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
226
- DocItemLabel.SECTION_HEADER
227
- enum_list_item_value = 0
228
- inline_paragraph_text += e.text
229
-
230
- if len(inline_paragraph_text) > 0:
231
- # output accumulated inline text:
232
- doc.add_text(
233
- label=doc_label,
234
- parent=parent_slide,
235
- text=inline_paragraph_text,
236
- prov=prov,
237
- )
238
-
239
- if len(inline_list_item_text) > 0:
240
- enum_marker = ""
241
- if is_numbered:
242
- enum_marker = str(enum_list_item_value) + "."
243
- if not is_list_group_created:
244
- new_list = doc.add_group(
245
- label=list_label, name=f"list", parent=parent_slide
246
- )
247
- is_list_group_created = True
248
- doc.add_list_item(
249
- marker=enum_marker,
250
- enumerated=is_numbered,
251
- parent=new_list,
252
- text=inline_list_item_text,
253
- prov=prov,
254
- )
255
- return
256
-
257
- def handle_title(self, shape, parent_slide, slide_ind, doc):
258
- placeholder_type = shape.placeholder_format.type
259
- txt = shape.text.strip()
260
- prov = self.generate_prov(shape, slide_ind, txt)
261
-
262
- if len(txt.strip()) > 0:
263
- # title = slide.shapes.title.text if slide.shapes.title else "No title"
264
- if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
265
- _log.info(f"Title found: {shape.text}")
266
- doc.add_text(
267
- label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
268
- )
269
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
270
- _log.info(f"Subtitle found: {shape.text}")
271
- # Using DocItemLabel.FOOTNOTE, while SUBTITLE label is not avail.
272
- doc.add_text(
273
- label=DocItemLabel.SECTION_HEADER,
274
- parent=parent_slide,
275
- text=txt,
276
- prov=prov,
277
- )
278
- return
279
-
280
- def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
281
- # Open it with PIL
282
- try:
283
- # Get the image bytes
284
- image = shape.image
285
- image_bytes = image.blob
286
- im_dpi, _ = image.dpi
287
- pil_image = Image.open(BytesIO(image_bytes))
288
-
289
- # shape has picture
290
- prov = self.generate_prov(shape, slide_ind, "", slide_size)
291
- doc.add_picture(
292
- parent=parent_slide,
293
- image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
294
- caption=None,
295
- prov=prov,
296
- )
297
- except (UnidentifiedImageError, OSError) as e:
298
- _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
299
- return
300
-
301
- def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
302
- # Handling tables, images, charts
303
- if shape.has_table:
304
- table = shape.table
305
- table_xml = shape._element
306
-
307
- prov = self.generate_prov(shape, slide_ind, "", slide_size)
308
-
309
- num_cols = 0
310
- num_rows = len(table.rows)
311
- tcells = []
312
- # Access the XML element for the shape that contains the table
313
- table_xml = shape._element
314
-
315
- for row_idx, row in enumerate(table.rows):
316
- if len(row.cells) > num_cols:
317
- num_cols = len(row.cells)
318
- for col_idx, cell in enumerate(row.cells):
319
- # Access the XML of the cell (this is the 'tc' element in table XML)
320
- cell_xml = table_xml.xpath(
321
- f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
322
- )
323
-
324
- if not cell_xml:
325
- continue # If no cell XML is found, skip
326
-
327
- cell_xml = cell_xml[0] # Get the first matching XML node
328
- row_span = cell_xml.get("rowSpan") # Vertical span
329
- col_span = cell_xml.get("gridSpan") # Horizontal span
330
-
331
- if row_span is None:
332
- row_span = 1
333
- else:
334
- row_span = int(row_span)
335
-
336
- if col_span is None:
337
- col_span = 1
338
- else:
339
- col_span = int(col_span)
340
-
341
- icell = TableCell(
342
- text=cell.text.strip(),
343
- row_span=row_span,
344
- col_span=col_span,
345
- start_row_offset_idx=row_idx,
346
- end_row_offset_idx=row_idx + row_span,
347
- start_col_offset_idx=col_idx,
348
- end_col_offset_idx=col_idx + col_span,
349
- col_header=False,
350
- row_header=False,
351
- )
352
- if len(cell.text.strip()) > 0:
353
- tcells.append(icell)
354
- # Initialize Docling TableData
355
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
356
- # Populate
357
- for tcell in tcells:
358
- data.table_cells.append(tcell)
359
- if len(tcells) > 0:
360
- # If table is not fully empty...
361
- # Create Docling table
362
- doc.add_table(parent=parent_slide, data=data, prov=prov)
363
- return
364
-
365
- def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
366
- # Units of size in PPTX by default are EMU units (English Metric Units)
367
- slide_width = pptx_obj.slide_width
368
- slide_height = pptx_obj.slide_height
369
-
370
- text_content = [] # type: ignore
371
-
372
- max_levels = 10
373
- parents = {} # type: ignore
374
- for i in range(0, max_levels):
375
- parents[i] = None
376
-
377
- # Loop through each slide
378
- for slide_num, slide in enumerate(pptx_obj.slides):
379
- slide_ind = pptx_obj.slides.index(slide)
380
- parent_slide = doc.add_group(
381
- name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
382
- )
383
-
384
- slide_size = Size(width=slide_width, height=slide_height)
385
- parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
386
-
387
- def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
388
- handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
389
- if shape.has_table:
390
- # Handle Tables
391
- self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
392
- if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
393
- # Handle Pictures
394
- self.handle_pictures(
395
- shape, parent_slide, slide_ind, doc, slide_size
396
- )
397
- # If shape doesn't have any text, move on to the next shape
398
- if not hasattr(shape, "text"):
399
- return
400
- if shape.text is None:
401
- return
402
- if len(shape.text.strip()) == 0:
403
- return
404
- if not shape.has_text_frame:
405
- _log.warning("Warning: shape has text but not text_frame")
406
- return
407
- # Handle other text elements, including lists (bullet lists, numbered lists)
408
- self.handle_text_elements(
409
- shape, parent_slide, slide_ind, doc, slide_size
410
- )
411
- return
412
-
413
- def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
414
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
415
- for groupedshape in shape.shapes:
416
- handle_shapes(
417
- groupedshape, parent_slide, slide_ind, doc, slide_size
418
- )
419
-
420
- # Loop through each shape in the slide
421
- for shape in slide.shapes:
422
- handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
423
-
424
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py DELETED
@@ -1,582 +0,0 @@
1
- import logging
2
- import re
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Any, Optional, Union
6
-
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupLabel,
12
- ImageRef,
13
- NodeItem,
14
- TableCell,
15
- TableData,
16
- )
17
- from docx import Document
18
- from docx.document import Document as DocxDocument
19
- from docx.oxml.table import CT_Tc
20
- from docx.oxml.xmlchemy import BaseOxmlElement
21
- from docx.table import Table, _Cell
22
- from docx.text.paragraph import Paragraph
23
- from lxml import etree
24
- from lxml.etree import XPath
25
- from PIL import Image, UnidentifiedImageError
26
- from typing_extensions import override
27
-
28
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
29
- from docling.datamodel.base_models import InputFormat
30
- from docling.datamodel.document import InputDocument
31
-
32
- _log = logging.getLogger(__name__)
33
-
34
-
35
- class MsWordDocumentBackend(DeclarativeDocumentBackend):
36
- @override
37
- def __init__(
38
- self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
39
- ) -> None:
40
- super().__init__(in_doc, path_or_stream)
41
- self.XML_KEY = (
42
- "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
43
- )
44
- self.xml_namespaces = {
45
- "w": "http://schemas.microsoft.com/office/word/2003/wordml"
46
- }
47
- # self.initialise(path_or_stream)
48
- # Word file:
49
- self.path_or_stream: Union[BytesIO, Path] = path_or_stream
50
- self.valid: bool = False
51
- # Initialise the parents for the hierarchy
52
- self.max_levels: int = 10
53
- self.level_at_new_list: Optional[int] = None
54
- self.parents: dict[int, Optional[NodeItem]] = {}
55
- for i in range(-1, self.max_levels):
56
- self.parents[i] = None
57
-
58
- self.level = 0
59
- self.listIter = 0
60
-
61
- self.history: dict[str, Any] = {
62
- "names": [None],
63
- "levels": [None],
64
- "numids": [None],
65
- "indents": [None],
66
- }
67
-
68
- self.docx_obj = None
69
- try:
70
- if isinstance(self.path_or_stream, BytesIO):
71
- self.docx_obj = Document(self.path_or_stream)
72
- elif isinstance(self.path_or_stream, Path):
73
- self.docx_obj = Document(str(self.path_or_stream))
74
-
75
- self.valid = True
76
- except Exception as e:
77
- raise RuntimeError(
78
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
79
- ) from e
80
-
81
- @override
82
- def is_valid(self) -> bool:
83
- return self.valid
84
-
85
- @classmethod
86
- @override
87
- def supports_pagination(cls) -> bool:
88
- return False
89
-
90
- @override
91
- def unload(self):
92
- if isinstance(self.path_or_stream, BytesIO):
93
- self.path_or_stream.close()
94
-
95
- self.path_or_stream = None
96
-
97
- @classmethod
98
- @override
99
- def supported_formats(cls) -> set[InputFormat]:
100
- return {InputFormat.DOCX}
101
-
102
- @override
103
- def convert(self) -> DoclingDocument:
104
- """Parses the DOCX into a structured document model.
105
-
106
- Returns:
107
- The parsed document.
108
- """
109
-
110
- origin = DocumentOrigin(
111
- filename=self.file.name or "file",
112
- mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
113
- binary_hash=self.document_hash,
114
- )
115
-
116
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
117
- if self.is_valid():
118
- assert self.docx_obj is not None
119
- doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
120
- return doc
121
- else:
122
- raise RuntimeError(
123
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
124
- )
125
-
126
- def update_history(
127
- self,
128
- name: str,
129
- level: Optional[int],
130
- numid: Optional[int],
131
- ilevel: Optional[int],
132
- ):
133
- self.history["names"].append(name)
134
- self.history["levels"].append(level)
135
-
136
- self.history["numids"].append(numid)
137
- self.history["indents"].append(ilevel)
138
-
139
- def prev_name(self) -> Optional[str]:
140
- return self.history["names"][-1]
141
-
142
- def prev_level(self) -> Optional[int]:
143
- return self.history["levels"][-1]
144
-
145
- def prev_numid(self) -> Optional[int]:
146
- return self.history["numids"][-1]
147
-
148
- def prev_indent(self) -> Optional[int]:
149
- return self.history["indents"][-1]
150
-
151
- def get_level(self) -> int:
152
- """Return the first None index."""
153
- for k, v in self.parents.items():
154
- if k >= 0 and v == None:
155
- return k
156
- return 0
157
-
158
- def walk_linear(
159
- self,
160
- body: BaseOxmlElement,
161
- docx_obj: DocxDocument,
162
- doc: DoclingDocument,
163
- ) -> DoclingDocument:
164
- for element in body:
165
- tag_name = etree.QName(element).localname
166
- # Check for Inline Images (blip elements)
167
- namespaces = {
168
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
169
- "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
170
- "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
171
- }
172
- xpath_expr = XPath(".//a:blip", namespaces=namespaces)
173
- drawing_blip = xpath_expr(element)
174
-
175
- # Check for Tables
176
- if element.tag.endswith("tbl"):
177
- try:
178
- self.handle_tables(element, docx_obj, doc)
179
- except Exception:
180
- _log.debug("could not parse a table, broken docx table")
181
-
182
- elif drawing_blip:
183
- self.handle_pictures(docx_obj, drawing_blip, doc)
184
- # Check for the sdt containers, like table of contents
185
- elif tag_name in ["sdt"]:
186
- sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
187
- if sdt_content is not None:
188
- # Iterate paragraphs, runs, or text inside <w:sdtContent>.
189
- paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
190
- for p in paragraphs:
191
- self.handle_text_elements(p, docx_obj, doc)
192
- # Check for Text
193
- elif tag_name in ["p"]:
194
- # "tcPr", "sectPr"
195
- self.handle_text_elements(element, docx_obj, doc)
196
- else:
197
- _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
198
- return doc
199
-
200
- def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
201
- if s is None:
202
- return None
203
- try:
204
- return int(s)
205
- except ValueError:
206
- return default
207
-
208
- def split_text_and_number(self, input_string: str) -> list[str]:
209
- match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
210
- if match:
211
- parts = list(filter(None, match.groups()))
212
- return parts
213
- else:
214
- return [input_string]
215
-
216
- def get_numId_and_ilvl(
217
- self, paragraph: Paragraph
218
- ) -> tuple[Optional[int], Optional[int]]:
219
- # Access the XML element of the paragraph
220
- numPr = paragraph._element.find(
221
- ".//w:numPr", namespaces=paragraph._element.nsmap
222
- )
223
-
224
- if numPr is not None:
225
- # Get the numId element and extract the value
226
- numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
227
- ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
228
- numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
229
- ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
230
-
231
- return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
232
-
233
- return None, None # If the paragraph is not part of a list
234
-
235
- def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
236
- if paragraph.style is None:
237
- return "Normal", None
238
- label = paragraph.style.style_id
239
- if label is None:
240
- return "Normal", None
241
- if ":" in label:
242
- parts = label.split(":")
243
-
244
- if len(parts) == 2:
245
- return parts[0], self.str_to_int(parts[1], None)
246
-
247
- parts = self.split_text_and_number(label)
248
-
249
- if "Heading" in label and len(parts) == 2:
250
- parts.sort()
251
- label_str: str = ""
252
- label_level: Optional[int] = 0
253
- if parts[0] == "Heading":
254
- label_str = parts[0]
255
- label_level = self.str_to_int(parts[1], None)
256
- if parts[1] == "Heading":
257
- label_str = parts[1]
258
- label_level = self.str_to_int(parts[0], None)
259
- return label_str, label_level
260
- else:
261
- return label, None
262
-
263
- def handle_text_elements(
264
- self,
265
- element: BaseOxmlElement,
266
- docx_obj: DocxDocument,
267
- doc: DoclingDocument,
268
- ) -> None:
269
- paragraph = Paragraph(element, docx_obj)
270
-
271
- if paragraph.text is None:
272
- return
273
- text = paragraph.text.strip()
274
-
275
- # Common styles for bullet and numbered lists.
276
- # "List Bullet", "List Number", "List Paragraph"
277
- # Identify wether list is a numbered list or not
278
- # is_numbered = "List Bullet" not in paragraph.style.name
279
- is_numbered = False
280
- p_style_id, p_level = self.get_label_and_level(paragraph)
281
- numid, ilevel = self.get_numId_and_ilvl(paragraph)
282
-
283
- if numid == 0:
284
- numid = None
285
-
286
- # Handle lists
287
- if (
288
- numid is not None
289
- and ilevel is not None
290
- and p_style_id not in ["Title", "Heading"]
291
- ):
292
- self.add_listitem(
293
- doc,
294
- numid,
295
- ilevel,
296
- text,
297
- is_numbered,
298
- )
299
- self.update_history(p_style_id, p_level, numid, ilevel)
300
- return
301
- elif (
302
- numid is None
303
- and self.prev_numid() is not None
304
- and p_style_id not in ["Title", "Heading"]
305
- ): # Close list
306
- if self.level_at_new_list:
307
- for key in range(len(self.parents)):
308
- if key >= self.level_at_new_list:
309
- self.parents[key] = None
310
- self.level = self.level_at_new_list - 1
311
- self.level_at_new_list = None
312
- else:
313
- for key in range(len(self.parents)):
314
- self.parents[key] = None
315
- self.level = 0
316
-
317
- if p_style_id in ["Title"]:
318
- for key in range(len(self.parents)):
319
- self.parents[key] = None
320
- self.parents[0] = doc.add_text(
321
- parent=None, label=DocItemLabel.TITLE, text=text
322
- )
323
- elif "Heading" in p_style_id:
324
- self.add_header(doc, p_level, text)
325
-
326
- elif p_style_id in [
327
- "Paragraph",
328
- "Normal",
329
- "Subtitle",
330
- "Author",
331
- "DefaultText",
332
- "ListParagraph",
333
- "ListBullet",
334
- "Quote",
335
- ]:
336
- level = self.get_level()
337
- doc.add_text(
338
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
339
- )
340
-
341
- else:
342
- # Text style names can, and will have, not only default values but user values too
343
- # hence we treat all other labels as pure text
344
- level = self.get_level()
345
- doc.add_text(
346
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
347
- )
348
-
349
- self.update_history(p_style_id, p_level, numid, ilevel)
350
- return
351
-
352
- def add_header(
353
- self, doc: DoclingDocument, curr_level: Optional[int], text: str
354
- ) -> None:
355
- level = self.get_level()
356
- if isinstance(curr_level, int):
357
- if curr_level > level:
358
- # add invisible group
359
- for i in range(level, curr_level):
360
- self.parents[i] = doc.add_group(
361
- parent=self.parents[i - 1],
362
- label=GroupLabel.SECTION,
363
- name=f"header-{i}",
364
- )
365
- elif curr_level < level:
366
- # remove the tail
367
- for key in range(len(self.parents)):
368
- if key >= curr_level:
369
- self.parents[key] = None
370
-
371
- self.parents[curr_level] = doc.add_heading(
372
- parent=self.parents[curr_level - 1],
373
- text=text,
374
- level=curr_level,
375
- )
376
- else:
377
- self.parents[self.level] = doc.add_heading(
378
- parent=self.parents[self.level - 1],
379
- text=text,
380
- level=1,
381
- )
382
- return
383
-
384
- def add_listitem(
385
- self,
386
- doc: DoclingDocument,
387
- numid: int,
388
- ilevel: int,
389
- text: str,
390
- is_numbered: bool = False,
391
- ) -> None:
392
- enum_marker = ""
393
-
394
- level = self.get_level()
395
- prev_indent = self.prev_indent()
396
- if self.prev_numid() is None: # Open new list
397
- self.level_at_new_list = level
398
-
399
- self.parents[level] = doc.add_group(
400
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
401
- )
402
-
403
- # Set marker and enumerated arguments if this is an enumeration element.
404
- self.listIter += 1
405
- if is_numbered:
406
- enum_marker = str(self.listIter) + "."
407
- is_numbered = True
408
- doc.add_list_item(
409
- marker=enum_marker,
410
- enumerated=is_numbered,
411
- parent=self.parents[level],
412
- text=text,
413
- )
414
-
415
- elif (
416
- self.prev_numid() == numid
417
- and self.level_at_new_list is not None
418
- and prev_indent is not None
419
- and prev_indent < ilevel
420
- ): # Open indented list
421
- for i in range(
422
- self.level_at_new_list + prev_indent + 1,
423
- self.level_at_new_list + ilevel + 1,
424
- ):
425
- # Determine if this is an unordered list or an ordered list.
426
- # Set GroupLabel.ORDERED_LIST when it fits.
427
- self.listIter = 0
428
- if is_numbered:
429
- self.parents[i] = doc.add_group(
430
- label=GroupLabel.ORDERED_LIST,
431
- name="list",
432
- parent=self.parents[i - 1],
433
- )
434
- else:
435
- self.parents[i] = doc.add_group(
436
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
437
- )
438
-
439
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
440
- self.listIter += 1
441
- if is_numbered:
442
- enum_marker = str(self.listIter) + "."
443
- is_numbered = True
444
- doc.add_list_item(
445
- marker=enum_marker,
446
- enumerated=is_numbered,
447
- parent=self.parents[self.level_at_new_list + ilevel],
448
- text=text,
449
- )
450
-
451
- elif (
452
- self.prev_numid() == numid
453
- and self.level_at_new_list is not None
454
- and prev_indent is not None
455
- and ilevel < prev_indent
456
- ): # Close list
457
- for k, v in self.parents.items():
458
- if k > self.level_at_new_list + ilevel:
459
- self.parents[k] = None
460
-
461
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
462
- self.listIter += 1
463
- if is_numbered:
464
- enum_marker = str(self.listIter) + "."
465
- is_numbered = True
466
- doc.add_list_item(
467
- marker=enum_marker,
468
- enumerated=is_numbered,
469
- parent=self.parents[self.level_at_new_list + ilevel],
470
- text=text,
471
- )
472
- self.listIter = 0
473
-
474
- elif self.prev_numid() == numid or prev_indent == ilevel:
475
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
476
- self.listIter += 1
477
- if is_numbered:
478
- enum_marker = str(self.listIter) + "."
479
- is_numbered = True
480
- doc.add_list_item(
481
- marker=enum_marker,
482
- enumerated=is_numbered,
483
- parent=self.parents[level - 1],
484
- text=text,
485
- )
486
- return
487
-
488
- def handle_tables(
489
- self,
490
- element: BaseOxmlElement,
491
- docx_obj: DocxDocument,
492
- doc: DoclingDocument,
493
- ) -> None:
494
- table: Table = Table(element, docx_obj)
495
- num_rows = len(table.rows)
496
- num_cols = len(table.columns)
497
- _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
498
-
499
- if num_rows == 1 and num_cols == 1:
500
- cell_element = table.rows[0].cells[0]
501
- # In case we have a table of only 1 cell, we consider it furniture
502
- # And proceed processing the content of the cell as though it's in the document body
503
- self.walk_linear(cell_element._element, docx_obj, doc)
504
- return
505
-
506
- data = TableData(num_rows=num_rows, num_cols=num_cols)
507
- cell_set: set[CT_Tc] = set()
508
- for row_idx, row in enumerate(table.rows):
509
- _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
510
- col_idx = 0
511
- while col_idx < num_cols:
512
- cell: _Cell = row.cells[col_idx]
513
- _log.debug(
514
- f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
515
- )
516
- if cell is None or cell._tc in cell_set:
517
- _log.debug(f" skipped since repeated content")
518
- col_idx += cell.grid_span
519
- continue
520
- else:
521
- cell_set.add(cell._tc)
522
-
523
- spanned_idx = row_idx
524
- spanned_tc: Optional[CT_Tc] = cell._tc
525
- while spanned_tc == cell._tc:
526
- spanned_idx += 1
527
- spanned_tc = (
528
- table.rows[spanned_idx].cells[col_idx]._tc
529
- if spanned_idx < num_rows
530
- else None
531
- )
532
- _log.debug(f" spanned before row {spanned_idx}")
533
-
534
- table_cell = TableCell(
535
- text=cell.text,
536
- row_span=spanned_idx - row_idx,
537
- col_span=cell.grid_span,
538
- start_row_offset_idx=row.grid_cols_before + row_idx,
539
- end_row_offset_idx=row.grid_cols_before + spanned_idx,
540
- start_col_offset_idx=col_idx,
541
- end_col_offset_idx=col_idx + cell.grid_span,
542
- col_header=False,
543
- row_header=False,
544
- )
545
- data.table_cells.append(table_cell)
546
- col_idx += cell.grid_span
547
-
548
- level = self.get_level()
549
- doc.add_table(data=data, parent=self.parents[level - 1])
550
- return
551
-
552
- def handle_pictures(
553
- self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
554
- ) -> None:
555
- def get_docx_image(drawing_blip):
556
- rId = drawing_blip[0].get(
557
- "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
558
- )
559
- if rId in docx_obj.part.rels:
560
- # Access the image part using the relationship ID
561
- image_part = docx_obj.part.rels[rId].target_part
562
- image_data = image_part.blob # Get the binary image data
563
- return image_data
564
-
565
- level = self.get_level()
566
- # Open the BytesIO object with PIL to create an Image
567
- try:
568
- image_data = get_docx_image(drawing_blip)
569
- image_bytes = BytesIO(image_data)
570
- pil_image = Image.open(image_bytes)
571
- doc.add_picture(
572
- parent=self.parents[level - 1],
573
- image=ImageRef.from_pil(image=pil_image, dpi=72),
574
- caption=None,
575
- )
576
- except (UnidentifiedImageError, OSError) as e:
577
- _log.warning("Warning: image cannot be loaded by Pillow")
578
- doc.add_picture(
579
- parent=self.parents[level - 1],
580
- caption=None,
581
- )
582
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py DELETED
@@ -1,76 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Iterable, Optional, Set, Union
5
-
6
- from docling_core.types.doc import BoundingBox, Size
7
- from PIL import Image
8
-
9
- from docling.backend.abstract_backend import PaginatedDocumentBackend
10
- from docling.datamodel.base_models import Cell, InputFormat
11
- from docling.datamodel.document import InputDocument
12
-
13
-
14
- class PdfPageBackend(ABC):
15
- @abstractmethod
16
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
17
- pass
18
-
19
- @abstractmethod
20
- def get_text_cells(self) -> Iterable[Cell]:
21
- pass
22
-
23
- @abstractmethod
24
- def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
25
- pass
26
-
27
- @abstractmethod
28
- def get_page_image(
29
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
30
- ) -> Image.Image:
31
- pass
32
-
33
- @abstractmethod
34
- def get_size(self) -> Size:
35
- pass
36
-
37
- @abstractmethod
38
- def is_valid(self) -> bool:
39
- pass
40
-
41
- @abstractmethod
42
- def unload(self):
43
- pass
44
-
45
-
46
- class PdfDocumentBackend(PaginatedDocumentBackend):
47
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
48
- super().__init__(in_doc, path_or_stream)
49
-
50
- if self.input_format is not InputFormat.PDF:
51
- if self.input_format is InputFormat.IMAGE:
52
- buf = BytesIO()
53
- img = Image.open(self.path_or_stream)
54
- img.save(buf, "PDF")
55
- buf.seek(0)
56
- self.path_or_stream = buf
57
- else:
58
- raise RuntimeError(
59
- f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
60
- )
61
-
62
- @abstractmethod
63
- def load_page(self, page_no: int) -> PdfPageBackend:
64
- pass
65
-
66
- @abstractmethod
67
- def page_count(self) -> int:
68
- pass
69
-
70
- @classmethod
71
- def supported_formats(cls) -> Set[InputFormat]:
72
- return {InputFormat.PDF}
73
-
74
- @classmethod
75
- def supports_pagination(cls) -> bool:
76
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py DELETED
@@ -1,260 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- import pypdfium2.raw as pdfium_c
9
- from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfTextPage
12
- from pypdfium2._helpers.misc import PdfiumError
13
-
14
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
- from docling.datamodel.base_models import Cell
16
-
17
- if TYPE_CHECKING:
18
- from docling.datamodel.document import InputDocument
19
-
20
- _log = logging.getLogger(__name__)
21
-
22
-
23
- class PyPdfiumPageBackend(PdfPageBackend):
24
- def __init__(
25
- self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
26
- ):
27
- self.valid = True # No better way to tell from pypdfium.
28
- try:
29
- self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
30
- except PdfiumError as e:
31
- _log.info(
32
- f"An exception occurred when loading page {page_no} of document {document_hash}.",
33
- exc_info=True,
34
- )
35
- self.valid = False
36
- self.text_page: Optional[PdfTextPage] = None
37
-
38
- def is_valid(self) -> bool:
39
- return self.valid
40
-
41
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
- AREA_THRESHOLD = 0 # 32 * 32
43
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
- pos = obj.get_pos()
45
- cropbox = BoundingBox.from_tuple(
46
- pos, origin=CoordOrigin.BOTTOMLEFT
47
- ).to_top_left_origin(page_height=self.get_size().height)
48
-
49
- if cropbox.area() > AREA_THRESHOLD:
50
- cropbox = cropbox.scaled(scale=scale)
51
-
52
- yield cropbox
53
-
54
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
55
- if not self.text_page:
56
- self.text_page = self._ppage.get_textpage()
57
-
58
- if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
59
- bbox = bbox.to_bottom_left_origin(self.get_size().height)
60
-
61
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
62
-
63
- return text_piece
64
-
65
- def get_text_cells(self) -> Iterable[Cell]:
66
- if not self.text_page:
67
- self.text_page = self._ppage.get_textpage()
68
-
69
- cells = []
70
- cell_counter = 0
71
-
72
- page_size = self.get_size()
73
-
74
- for i in range(self.text_page.count_rects()):
75
- rect = self.text_page.get_rect(i)
76
- text_piece = self.text_page.get_text_bounded(*rect)
77
- x0, y0, x1, y1 = rect
78
- cells.append(
79
- Cell(
80
- id=cell_counter,
81
- text=text_piece,
82
- bbox=BoundingBox(
83
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
84
- ).to_top_left_origin(page_size.height),
85
- )
86
- )
87
- cell_counter += 1
88
-
89
- # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
90
- # The cell merging code below is to clean this up.
91
- def merge_horizontal_cells(
92
- cells: List[Cell],
93
- horizontal_threshold_factor: float = 1.0,
94
- vertical_threshold_factor: float = 0.5,
95
- ) -> List[Cell]:
96
- if not cells:
97
- return []
98
-
99
- def group_rows(cells: List[Cell]) -> List[List[Cell]]:
100
- rows = []
101
- current_row = [cells[0]]
102
- row_top = cells[0].bbox.t
103
- row_bottom = cells[0].bbox.b
104
- row_height = cells[0].bbox.height
105
-
106
- for cell in cells[1:]:
107
- vertical_threshold = row_height * vertical_threshold_factor
108
- if (
109
- abs(cell.bbox.t - row_top) <= vertical_threshold
110
- and abs(cell.bbox.b - row_bottom) <= vertical_threshold
111
- ):
112
- current_row.append(cell)
113
- row_top = min(row_top, cell.bbox.t)
114
- row_bottom = max(row_bottom, cell.bbox.b)
115
- row_height = row_bottom - row_top
116
- else:
117
- rows.append(current_row)
118
- current_row = [cell]
119
- row_top = cell.bbox.t
120
- row_bottom = cell.bbox.b
121
- row_height = cell.bbox.height
122
-
123
- if current_row:
124
- rows.append(current_row)
125
-
126
- return rows
127
-
128
- def merge_row(row: List[Cell]) -> List[Cell]:
129
- merged = []
130
- current_group = [row[0]]
131
-
132
- for cell in row[1:]:
133
- prev_cell = current_group[-1]
134
- avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
135
- if (
136
- cell.bbox.l - prev_cell.bbox.r
137
- <= avg_height * horizontal_threshold_factor
138
- ):
139
- current_group.append(cell)
140
- else:
141
- merged.append(merge_group(current_group))
142
- current_group = [cell]
143
-
144
- if current_group:
145
- merged.append(merge_group(current_group))
146
-
147
- return merged
148
-
149
- def merge_group(group: List[Cell]) -> Cell:
150
- if len(group) == 1:
151
- return group[0]
152
-
153
- merged_text = "".join(cell.text for cell in group)
154
- merged_bbox = BoundingBox(
155
- l=min(cell.bbox.l for cell in group),
156
- t=min(cell.bbox.t for cell in group),
157
- r=max(cell.bbox.r for cell in group),
158
- b=max(cell.bbox.b for cell in group),
159
- )
160
- return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
161
-
162
- rows = group_rows(cells)
163
- merged_cells = [cell for row in rows for cell in merge_row(row)]
164
-
165
- for i, cell in enumerate(merged_cells, 1):
166
- cell.id = i
167
-
168
- return merged_cells
169
-
170
- def draw_clusters_and_cells():
171
- image = (
172
- self.get_page_image()
173
- ) # make new image to avoid drawing on the saved ones
174
- draw = ImageDraw.Draw(image)
175
- for c in cells:
176
- x0, y0, x1, y1 = c.bbox.as_tuple()
177
- cell_color = (
178
- random.randint(30, 140),
179
- random.randint(30, 140),
180
- random.randint(30, 140),
181
- )
182
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
183
- image.show()
184
-
185
- # before merge:
186
- # draw_clusters_and_cells()
187
-
188
- cells = merge_horizontal_cells(cells)
189
-
190
- # after merge:
191
- # draw_clusters_and_cells()
192
-
193
- return cells
194
-
195
- def get_page_image(
196
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
197
- ) -> Image.Image:
198
-
199
- page_size = self.get_size()
200
-
201
- if not cropbox:
202
- cropbox = BoundingBox(
203
- l=0,
204
- r=page_size.width,
205
- t=0,
206
- b=page_size.height,
207
- coord_origin=CoordOrigin.TOPLEFT,
208
- )
209
- padbox = BoundingBox(
210
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
211
- )
212
- else:
213
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
214
- padbox.r = page_size.width - padbox.r
215
- padbox.t = page_size.height - padbox.t
216
-
217
- image = (
218
- self._ppage.render(
219
- scale=scale * 1.5,
220
- rotation=0, # no additional rotation
221
- crop=padbox.as_tuple(),
222
- )
223
- .to_pil()
224
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
225
- ) # We resize the image from 1.5x the given scale to make it sharper.
226
-
227
- return image
228
-
229
- def get_size(self) -> Size:
230
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
231
-
232
- def unload(self):
233
- self._ppage = None
234
- self.text_page = None
235
-
236
-
237
- class PyPdfiumDocumentBackend(PdfDocumentBackend):
238
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
239
- super().__init__(in_doc, path_or_stream)
240
-
241
- try:
242
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
243
- except PdfiumError as e:
244
- raise RuntimeError(
245
- f"pypdfium could not load document with hash {self.document_hash}"
246
- ) from e
247
-
248
- def page_count(self) -> int:
249
- return len(self._pdoc)
250
-
251
- def load_page(self, page_no: int) -> PyPdfiumPageBackend:
252
- return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
253
-
254
- def is_valid(self) -> bool:
255
- return self.page_count() > 0
256
-
257
- def unload(self):
258
- super().unload()
259
- self._pdoc.close()
260
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py DELETED
@@ -1,592 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Any, Set, Union
5
-
6
- import lxml
7
- from bs4 import BeautifulSoup
8
- from docling_core.types.doc import (
9
- DocItemLabel,
10
- DoclingDocument,
11
- DocumentOrigin,
12
- GroupLabel,
13
- TableCell,
14
- TableData,
15
- )
16
- from lxml import etree
17
- from typing_extensions import TypedDict, override
18
-
19
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
- from docling.datamodel.base_models import InputFormat
21
- from docling.datamodel.document import InputDocument
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class Paragraph(TypedDict):
27
- text: str
28
- headers: list[str]
29
-
30
-
31
- class Author(TypedDict):
32
- name: str
33
- affiliation_names: list[str]
34
-
35
-
36
- class Table(TypedDict):
37
- label: str
38
- caption: str
39
- content: str
40
-
41
-
42
- class FigureCaption(TypedDict):
43
- label: str
44
- caption: str
45
-
46
-
47
- class Reference(TypedDict):
48
- author_names: str
49
- title: str
50
- journal: str
51
- year: str
52
-
53
-
54
- class XMLComponents(TypedDict):
55
- title: str
56
- authors: list[Author]
57
- abstract: str
58
- paragraphs: list[Paragraph]
59
- tables: list[Table]
60
- figure_captions: list[FigureCaption]
61
- references: list[Reference]
62
-
63
-
64
- class PubMedDocumentBackend(DeclarativeDocumentBackend):
65
- """
66
- The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
67
- Achakulvisut et al., (2020).
68
- Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
69
- Journal of Open Source Software, 5(46), 1979,
70
- https://doi.org/10.21105/joss.01979
71
- """
72
-
73
- @override
74
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
75
- super().__init__(in_doc, path_or_stream)
76
- self.path_or_stream = path_or_stream
77
-
78
- # Initialize parents for the document hierarchy
79
- self.parents: dict = {}
80
-
81
- self.valid = False
82
- try:
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.seek(0)
85
- self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
86
- if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
87
- self.valid = True
88
- except Exception as exc:
89
- raise RuntimeError(
90
- f"Could not initialize PubMed backend for file with hash {self.document_hash}."
91
- ) from exc
92
-
93
- @override
94
- def is_valid(self) -> bool:
95
- return self.valid
96
-
97
- @classmethod
98
- @override
99
- def supports_pagination(cls) -> bool:
100
- return False
101
-
102
- @override
103
- def unload(self):
104
- if isinstance(self.path_or_stream, BytesIO):
105
- self.path_or_stream.close()
106
- self.path_or_stream = None
107
-
108
- @classmethod
109
- @override
110
- def supported_formats(cls) -> Set[InputFormat]:
111
- return {InputFormat.XML_PUBMED}
112
-
113
- @override
114
- def convert(self) -> DoclingDocument:
115
- # Create empty document
116
- origin = DocumentOrigin(
117
- filename=self.file.name or "file",
118
- mimetype="application/xml",
119
- binary_hash=self.document_hash,
120
- )
121
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
122
-
123
- _log.debug("Trying to convert PubMed XML document...")
124
-
125
- # Get parsed XML components
126
- xml_components: XMLComponents = self._parse()
127
-
128
- # Add XML components to the document
129
- doc = self._populate_document(doc, xml_components)
130
- return doc
131
-
132
- def _parse_title(self) -> str:
133
- title: str = " ".join(
134
- [
135
- t.replace("\n", "")
136
- for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
137
- ]
138
- )
139
- return title
140
-
141
- def _parse_authors(self) -> list[Author]:
142
- # Get mapping between affiliation ids and names
143
- affiliation_names = []
144
- for affiliation_node in self.tree.xpath(".//aff[@id]"):
145
- affiliation_names.append(
146
- ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
147
- )
148
- affiliation_ids_names = {
149
- id: name
150
- for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
151
- }
152
-
153
- # Get author names and affiliation names
154
- authors: list[Author] = []
155
- for author_node in self.tree.xpath(
156
- './/contrib-group/contrib[@contrib-type="author"]'
157
- ):
158
- author: Author = {
159
- "name": "",
160
- "affiliation_names": [],
161
- }
162
-
163
- # Affiliation names
164
- affiliation_ids = [
165
- a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
166
- ]
167
- for id in affiliation_ids:
168
- if id in affiliation_ids_names:
169
- author["affiliation_names"].append(affiliation_ids_names[id])
170
-
171
- # Name
172
- author["name"] = (
173
- author_node.xpath("name/surname")[0].text
174
- + " "
175
- + author_node.xpath("name/given-names")[0].text
176
- )
177
-
178
- authors.append(author)
179
- return authors
180
-
181
- def _parse_abstract(self) -> str:
182
- texts = []
183
- for abstract_node in self.tree.xpath(".//abstract"):
184
- for text in abstract_node.itertext():
185
- texts.append(text.replace("\n", ""))
186
- abstract: str = "".join(texts)
187
- return abstract
188
-
189
- def _parse_main_text(self) -> list[Paragraph]:
190
- paragraphs: list[Paragraph] = []
191
- for paragraph_node in self.tree.xpath("//body//p"):
192
- # Skip captions
193
- if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
194
- continue
195
-
196
- paragraph: Paragraph = {"text": "", "headers": []}
197
-
198
- # Text
199
- paragraph["text"] = "".join(
200
- [t.replace("\n", "") for t in paragraph_node.itertext()]
201
- )
202
-
203
- # Header
204
- path = "../title"
205
- while len(paragraph_node.xpath(path)) > 0:
206
- paragraph["headers"].append(
207
- "".join(
208
- [
209
- t.replace("\n", "")
210
- for t in paragraph_node.xpath(path)[0].itertext()
211
- ]
212
- )
213
- )
214
- path = "../" + path
215
-
216
- paragraphs.append(paragraph)
217
-
218
- return paragraphs
219
-
220
- def _parse_tables(self) -> list[Table]:
221
- tables: list[Table] = []
222
- for table_node in self.tree.xpath(".//body//table-wrap"):
223
- table: Table = {"label": "", "caption": "", "content": ""}
224
-
225
- # Content
226
- if len(table_node.xpath("table")) > 0:
227
- table_content_node = table_node.xpath("table")[0]
228
- elif len(table_node.xpath("alternatives/table")) > 0:
229
- table_content_node = table_node.xpath("alternatives/table")[0]
230
- else:
231
- table_content_node = None
232
- if table_content_node != None:
233
- table["content"] = etree.tostring(table_content_node).decode("utf-8")
234
-
235
- # Caption
236
- if len(table_node.xpath("caption/p")) > 0:
237
- caption_node = table_node.xpath("caption/p")[0]
238
- elif len(table_node.xpath("caption/title")) > 0:
239
- caption_node = table_node.xpath("caption/title")[0]
240
- else:
241
- caption_node = None
242
- if caption_node != None:
243
- table["caption"] = "".join(
244
- [t.replace("\n", "") for t in caption_node.itertext()]
245
- )
246
-
247
- # Label
248
- if len(table_node.xpath("label")) > 0:
249
- table["label"] = table_node.xpath("label")[0].text
250
-
251
- tables.append(table)
252
- return tables
253
-
254
- def _parse_figure_captions(self) -> list[FigureCaption]:
255
- figure_captions: list[FigureCaption] = []
256
-
257
- if not (self.tree.xpath(".//fig")):
258
- return figure_captions
259
-
260
- for figure_node in self.tree.xpath(".//fig"):
261
- figure_caption: FigureCaption = {
262
- "caption": "",
263
- "label": "",
264
- }
265
-
266
- # Label
267
- if figure_node.xpath("label"):
268
- figure_caption["label"] = "".join(
269
- [
270
- t.replace("\n", "")
271
- for t in figure_node.xpath("label")[0].itertext()
272
- ]
273
- )
274
-
275
- # Caption
276
- if figure_node.xpath("caption"):
277
- caption = ""
278
- for caption_node in figure_node.xpath("caption")[0].getchildren():
279
- caption += (
280
- "".join([t.replace("\n", "") for t in caption_node.itertext()])
281
- + "\n"
282
- )
283
- figure_caption["caption"] = caption
284
-
285
- figure_captions.append(figure_caption)
286
-
287
- return figure_captions
288
-
289
- def _parse_references(self) -> list[Reference]:
290
- references: list[Reference] = []
291
- for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
292
- reference: Reference = {
293
- "author_names": "",
294
- "title": "",
295
- "journal": "",
296
- "year": "",
297
- }
298
- reference_node: Any = None
299
- for tag in ["mixed-citation", "element-citation", "citation"]:
300
- if len(reference_node_abs.xpath(tag)) > 0:
301
- reference_node = reference_node_abs.xpath(tag)[0]
302
- break
303
-
304
- if reference_node is None:
305
- continue
306
-
307
- if all(
308
- not (ref_type in ["citation-type", "publication-type"])
309
- for ref_type in reference_node.attrib.keys()
310
- ):
311
- continue
312
-
313
- # Author names
314
- names = []
315
- if len(reference_node.xpath("name")) > 0:
316
- for name_node in reference_node.xpath("name"):
317
- name_str = " ".join(
318
- [t.text for t in name_node.getchildren() if (t.text != None)]
319
- )
320
- names.append(name_str)
321
- elif len(reference_node.xpath("person-group")) > 0:
322
- for name_node in reference_node.xpath("person-group")[0]:
323
- name_str = (
324
- name_node.xpath("given-names")[0].text
325
- + " "
326
- + name_node.xpath("surname")[0].text
327
- )
328
- names.append(name_str)
329
- reference["author_names"] = "; ".join(names)
330
-
331
- # Title
332
- if len(reference_node.xpath("article-title")) > 0:
333
- reference["title"] = " ".join(
334
- [
335
- t.replace("\n", " ")
336
- for t in reference_node.xpath("article-title")[0].itertext()
337
- ]
338
- )
339
-
340
- # Journal
341
- if len(reference_node.xpath("source")) > 0:
342
- reference["journal"] = reference_node.xpath("source")[0].text
343
-
344
- # Year
345
- if len(reference_node.xpath("year")) > 0:
346
- reference["year"] = reference_node.xpath("year")[0].text
347
-
348
- if (
349
- not (reference_node.xpath("article-title"))
350
- and not (reference_node.xpath("journal"))
351
- and not (reference_node.xpath("year"))
352
- ):
353
- reference["title"] = reference_node.text
354
-
355
- references.append(reference)
356
- return references
357
-
358
- def _parse(self) -> XMLComponents:
359
- """Parsing PubMed document."""
360
- xml_components: XMLComponents = {
361
- "title": self._parse_title(),
362
- "authors": self._parse_authors(),
363
- "abstract": self._parse_abstract(),
364
- "paragraphs": self._parse_main_text(),
365
- "tables": self._parse_tables(),
366
- "figure_captions": self._parse_figure_captions(),
367
- "references": self._parse_references(),
368
- }
369
- return xml_components
370
-
371
- def _populate_document(
372
- self, doc: DoclingDocument, xml_components: XMLComponents
373
- ) -> DoclingDocument:
374
- self._add_title(doc, xml_components)
375
- self._add_authors(doc, xml_components)
376
- self._add_abstract(doc, xml_components)
377
- self._add_main_text(doc, xml_components)
378
-
379
- if xml_components["tables"]:
380
- self._add_tables(doc, xml_components)
381
-
382
- if xml_components["figure_captions"]:
383
- self._add_figure_captions(doc, xml_components)
384
-
385
- self._add_references(doc, xml_components)
386
- return doc
387
-
388
- def _add_figure_captions(
389
- self, doc: DoclingDocument, xml_components: XMLComponents
390
- ) -> None:
391
- self.parents["Figures"] = doc.add_heading(
392
- parent=self.parents["Title"], text="Figures"
393
- )
394
- for figure_caption_xml_component in xml_components["figure_captions"]:
395
- figure_caption_text = (
396
- figure_caption_xml_component["label"]
397
- + ": "
398
- + figure_caption_xml_component["caption"].strip()
399
- )
400
- fig_caption = doc.add_text(
401
- label=DocItemLabel.CAPTION, text=figure_caption_text
402
- )
403
- doc.add_picture(
404
- parent=self.parents["Figures"],
405
- caption=fig_caption,
406
- )
407
- return
408
-
409
- def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
410
- self.parents["Title"] = doc.add_text(
411
- parent=None,
412
- text=xml_components["title"],
413
- label=DocItemLabel.TITLE,
414
- )
415
- return
416
-
417
- def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
418
- authors_affiliations: list = []
419
- for author in xml_components["authors"]:
420
- authors_affiliations.append(author["name"])
421
- authors_affiliations.append(", ".join(author["affiliation_names"]))
422
- authors_affiliations_str = "; ".join(authors_affiliations)
423
-
424
- doc.add_text(
425
- parent=self.parents["Title"],
426
- text=authors_affiliations_str,
427
- label=DocItemLabel.PARAGRAPH,
428
- )
429
- return
430
-
431
- def _add_abstract(
432
- self, doc: DoclingDocument, xml_components: XMLComponents
433
- ) -> None:
434
- abstract_text: str = xml_components["abstract"]
435
- self.parents["Abstract"] = doc.add_heading(
436
- parent=self.parents["Title"], text="Abstract"
437
- )
438
- doc.add_text(
439
- parent=self.parents["Abstract"],
440
- text=abstract_text,
441
- label=DocItemLabel.TEXT,
442
- )
443
- return
444
-
445
- def _add_main_text(
446
- self, doc: DoclingDocument, xml_components: XMLComponents
447
- ) -> None:
448
- added_headers: list = []
449
- for paragraph in xml_components["paragraphs"]:
450
- if not (paragraph["headers"]):
451
- continue
452
-
453
- # Header
454
- for i, header in enumerate(reversed(paragraph["headers"])):
455
- if header in added_headers:
456
- continue
457
- added_headers.append(header)
458
-
459
- if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
460
- i - 1
461
- ] in self.parents:
462
- parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
463
- else:
464
- parent = self.parents["Title"]
465
-
466
- self.parents[header] = doc.add_heading(parent=parent, text=header)
467
-
468
- # Paragraph text
469
- if paragraph["headers"][0] in self.parents:
470
- parent = self.parents[paragraph["headers"][0]]
471
- else:
472
- parent = self.parents["Title"]
473
-
474
- doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
475
- return
476
-
477
- def _add_references(
478
- self, doc: DoclingDocument, xml_components: XMLComponents
479
- ) -> None:
480
- self.parents["References"] = doc.add_heading(
481
- parent=self.parents["Title"], text="References"
482
- )
483
- current_list = doc.add_group(
484
- parent=self.parents["References"], label=GroupLabel.LIST, name="list"
485
- )
486
- for reference in xml_components["references"]:
487
- reference_text: str = ""
488
- if reference["author_names"]:
489
- reference_text += reference["author_names"] + ". "
490
-
491
- if reference["title"]:
492
- reference_text += reference["title"]
493
- if reference["title"][-1] != ".":
494
- reference_text += "."
495
- reference_text += " "
496
-
497
- if reference["journal"]:
498
- reference_text += reference["journal"]
499
-
500
- if reference["year"]:
501
- reference_text += " (" + reference["year"] + ")"
502
-
503
- if not (reference_text):
504
- _log.debug(f"Skipping reference for: {str(self.file)}")
505
- continue
506
-
507
- doc.add_list_item(
508
- text=reference_text, enumerated=False, parent=current_list
509
- )
510
- return
511
-
512
- def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
513
- self.parents["Tables"] = doc.add_heading(
514
- parent=self.parents["Title"], text="Tables"
515
- )
516
- for table_xml_component in xml_components["tables"]:
517
- try:
518
- self._add_table(doc, table_xml_component)
519
- except Exception as e:
520
- _log.debug(f"Skipping unsupported table for: {str(self.file)}")
521
- pass
522
- return
523
-
524
- def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
525
- soup = BeautifulSoup(table_xml_component["content"], "html.parser")
526
- table_tag = soup.find("table")
527
-
528
- nested_tables = table_tag.find("table")
529
- if nested_tables:
530
- _log.debug(f"Skipping nested table for: {str(self.file)}")
531
- return
532
-
533
- # Count the number of rows (number of <tr> elements)
534
- num_rows = len(table_tag.find_all("tr"))
535
-
536
- # Find the number of columns (taking into account colspan)
537
- num_cols = 0
538
- for row in table_tag.find_all("tr"):
539
- col_count = 0
540
- for cell in row.find_all(["td", "th"]):
541
- colspan = int(cell.get("colspan", 1))
542
- col_count += colspan
543
- num_cols = max(num_cols, col_count)
544
-
545
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
546
-
547
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
548
-
549
- # Iterate over the rows in the table
550
- for row_idx, row in enumerate(table_tag.find_all("tr")):
551
- # For each row, find all the column cells (both <td> and <th>)
552
- cells = row.find_all(["td", "th"])
553
-
554
- # Check if each cell in the row is a header -> means it is a column header
555
- col_header = True
556
- for j, html_cell in enumerate(cells):
557
- if html_cell.name == "td":
558
- col_header = False
559
-
560
- # Extract and print the text content of each cell
561
- col_idx = 0
562
- for _, html_cell in enumerate(cells):
563
- text = html_cell.text
564
-
565
- col_span = int(html_cell.get("colspan", 1))
566
- row_span = int(html_cell.get("rowspan", 1))
567
-
568
- while grid[row_idx][col_idx] != None:
569
- col_idx += 1
570
- for r in range(row_span):
571
- for c in range(col_span):
572
- grid[row_idx + r][col_idx + c] = text
573
-
574
- cell = TableCell(
575
- text=text,
576
- row_span=row_span,
577
- col_span=col_span,
578
- start_row_offset_idx=row_idx,
579
- end_row_offset_idx=row_idx + row_span,
580
- start_col_offset_idx=col_idx,
581
- end_col_offset_idx=col_idx + col_span,
582
- col_header=col_header,
583
- row_header=((not col_header) and html_cell.name == "th"),
584
- )
585
- data.table_cells.append(cell)
586
-
587
- table_caption = doc.add_text(
588
- label=DocItemLabel.CAPTION,
589
- text=table_xml_component["label"] + ": " + table_xml_component["caption"],
590
- )
591
- doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
592
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py DELETED
@@ -1,1888 +0,0 @@
1
- """Backend to parse patents from the United States Patent Office (USPTO).
2
-
3
- The parsers included in this module can handle patent grants pubished since 1976 and
4
- patent applications since 2001.
5
- The original files can be found in https://bulkdata.uspto.gov.
6
- """
7
-
8
- import html
9
- import logging
10
- import re
11
- import xml.sax
12
- import xml.sax.xmlreader
13
- from abc import ABC, abstractmethod
14
- from enum import Enum, unique
15
- from io import BytesIO
16
- from pathlib import Path
17
- from typing import Any, Final, Optional, Union
18
-
19
- from bs4 import BeautifulSoup, Tag
20
- from docling_core.types.doc import (
21
- DocItem,
22
- DocItemLabel,
23
- DoclingDocument,
24
- DocumentOrigin,
25
- TableCell,
26
- TableData,
27
- TextItem,
28
- )
29
- from docling_core.types.doc.document import LevelNumber
30
- from pydantic import NonNegativeInt
31
- from typing_extensions import Self, TypedDict, override
32
-
33
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
34
- from docling.datamodel.base_models import InputFormat
35
- from docling.datamodel.document import InputDocument
36
-
37
- _log = logging.getLogger(__name__)
38
-
39
- XML_DECLARATION: Final = '<?xml version="1.0" encoding="UTF-8"?>'
40
-
41
-
42
- @unique
43
- class PatentHeading(Enum):
44
- """Text of docling headings for tagged sections in USPTO patent documents."""
45
-
46
- ABSTRACT = "ABSTRACT", 2
47
- CLAIMS = "CLAIMS", 2
48
-
49
- @override
50
- def __new__(cls, value: str, _) -> Self:
51
- obj = object.__new__(cls)
52
- obj._value_ = value
53
- return obj
54
-
55
- @override
56
- def __init__(self, _, level: LevelNumber) -> None:
57
- self.level: LevelNumber = level
58
-
59
-
60
- class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
61
- @override
62
- def __init__(
63
- self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
64
- ) -> None:
65
- super().__init__(in_doc, path_or_stream)
66
-
67
- self.patent_content: str = ""
68
- self.parser: Optional[PatentUspto] = None
69
-
70
- try:
71
- if isinstance(self.path_or_stream, BytesIO):
72
- while line := self.path_or_stream.readline().decode("utf-8"):
73
- if line.startswith("<!DOCTYPE") or line == "PATN\n":
74
- self._set_parser(line)
75
- self.patent_content += line
76
- elif isinstance(self.path_or_stream, Path):
77
- with open(self.path_or_stream, encoding="utf-8") as file_obj:
78
- while line := file_obj.readline():
79
- if line.startswith("<!DOCTYPE") or line == "PATN\n":
80
- self._set_parser(line)
81
- self.patent_content += line
82
- except Exception as exc:
83
- raise RuntimeError(
84
- f"Could not initialize USPTO backend for file with hash {self.document_hash}."
85
- ) from exc
86
-
87
- def _set_parser(self, doctype: str) -> None:
88
- doctype_line = doctype.lower()
89
- if doctype == "PATN\n":
90
- self.parser = PatentUsptoGrantAps()
91
- elif "us-patent-application-v4" in doctype_line:
92
- self.parser = PatentUsptoIce()
93
- elif "us-patent-grant-v4" in doctype_line:
94
- self.parser = PatentUsptoIce()
95
- elif "us-grant-025" in doctype_line:
96
- self.parser = PatentUsptoGrantV2()
97
- elif all(
98
- item in doctype_line
99
- for item in ("patent-application-publication", "pap-v1")
100
- ):
101
- self.parser = PatentUsptoAppV1()
102
- else:
103
- self.parser = None
104
-
105
- @override
106
- def is_valid(self) -> bool:
107
- return bool(self.patent_content) and bool(self.parser)
108
-
109
- @classmethod
110
- @override
111
- def supports_pagination(cls) -> bool:
112
- return False
113
-
114
- @override
115
- def unload(self) -> None:
116
- return
117
-
118
- @classmethod
119
- @override
120
- def supported_formats(cls) -> set[InputFormat]:
121
- return {InputFormat.XML_USPTO}
122
-
123
- @override
124
- def convert(self) -> DoclingDocument:
125
-
126
- if self.parser is not None:
127
- doc = self.parser.parse(self.patent_content)
128
- if doc is None:
129
- raise RuntimeError(
130
- f"Failed to convert doc (hash={self.document_hash}, "
131
- f"name={self.file.name})."
132
- )
133
- doc.name = self.file.name or "file"
134
- mime_type = (
135
- "text/plain"
136
- if isinstance(self.parser, PatentUsptoGrantAps)
137
- else "application/xml"
138
- )
139
- doc.origin = DocumentOrigin(
140
- mimetype=mime_type,
141
- binary_hash=self.document_hash,
142
- filename=self.file.name or "file",
143
- )
144
-
145
- return doc
146
- else:
147
- raise RuntimeError(
148
- f"Cannot convert doc (hash={self.document_hash}, "
149
- f"name={self.file.name}) because the backend failed to init."
150
- )
151
-
152
-
153
- class PatentUspto(ABC):
154
- """Parser of patent documents from the US Patent Office."""
155
-
156
- @abstractmethod
157
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
158
- """Parse a USPTO patent.
159
-
160
- Parameters:
161
- patent_content: The content of a single patent in a USPTO file.
162
-
163
- Returns:
164
- The patent parsed as a docling document.
165
- """
166
- pass
167
-
168
-
169
- class PatentUsptoIce(PatentUspto):
170
- """Parser of patent documents from the US Patent Office (ICE).
171
-
172
- The compatible formats are:
173
- - Patent Grant Full Text Data/XML Version 4.x ICE (from January 2005)
174
- - Patent Application Full Text Data/XML Version 4.x ICE (from January 2005)
175
- """
176
-
177
- def __init__(self) -> None:
178
- """Build an instance of PatentUsptoIce class."""
179
- self.handler = PatentUsptoIce.PatentHandler()
180
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
181
-
182
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
183
- try:
184
- xml.sax.parseString(patent_content, self.handler)
185
- except xml.sax._exceptions.SAXParseException as exc_sax:
186
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
187
-
188
- return None
189
-
190
- doc = self.handler.doc
191
- if doc:
192
- raw_tables = re.findall(self.pattern, patent_content)
193
- parsed_tables: list[TableData] = []
194
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
195
- for table in raw_tables:
196
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
197
- try:
198
- table_data = table_parser.parse()
199
- if table_data:
200
- parsed_tables.append(table_data)
201
- except Exception as exc_table:
202
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
203
- if len(parsed_tables) != len(doc.tables):
204
- _log.error(
205
- f"Number of referenced ({len(doc.tables)}) and parsed "
206
- f"({len(parsed_tables)}) tables differ."
207
- )
208
- else:
209
- for idx, item in enumerate(parsed_tables):
210
- doc.tables[idx].data = item
211
-
212
- return doc
213
-
214
- class PatentHandler(xml.sax.handler.ContentHandler):
215
- """SAX ContentHandler for patent documents."""
216
-
217
- APP_DOC_ELEMENT: Final = "us-patent-application"
218
- GRANT_DOC_ELEMENT: Final = "us-patent-grant"
219
-
220
- @unique
221
- class Element(Enum):
222
- """Represents an element of interest in the patent application document."""
223
-
224
- ABSTRACT = "abstract", True
225
- TITLE = "invention-title", True
226
- CLAIMS = "claims", False
227
- CLAIM = "claim", False
228
- CLAIM_TEXT = "claim-text", True
229
- PARAGRAPH = "p", True
230
- HEADING = "heading", True
231
- DESCRIPTION = "description", False
232
- TABLE = "table", False # to track its position, without text
233
- DRAWINGS = "description-of-drawings", True
234
- STYLE_SUPERSCRIPT = "sup", True
235
- STYLE_SUBSCRIPT = "sub", True
236
- MATHS = "maths", False # to avoid keeping formulas
237
-
238
- @override
239
- def __new__(cls, value: str, _) -> Self:
240
- obj = object.__new__(cls)
241
- obj._value_ = value
242
- return obj
243
-
244
- @override
245
- def __init__(self, _, is_text: bool) -> None:
246
- self.is_text: bool = is_text
247
-
248
- @override
249
- def __init__(self) -> None:
250
- """Build an instance of the patent handler."""
251
- # Current patent being parsed
252
- self.doc: Optional[DoclingDocument] = None
253
- # Keep track of docling hierarchy level
254
- self.level: LevelNumber = 1
255
- # Keep track of docling parents by level
256
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
257
- # Content to retain for the current patent
258
- self.property: list[str]
259
- self.claim: str
260
- self.claims: list[str]
261
- self.abstract: str
262
- self.text: str
263
- self._clean_data()
264
- # To handle mathematical styling
265
- self.style_html = HtmlEntity()
266
-
267
- @override
268
- def startElement(self, tag, attributes): # noqa: N802
269
- """Signal the start of an element.
270
-
271
- Args:
272
- tag: The element tag.
273
- attributes: The element attributes.
274
- """
275
- if tag in (
276
- self.APP_DOC_ELEMENT,
277
- self.GRANT_DOC_ELEMENT,
278
- ):
279
- self.doc = DoclingDocument(name="file")
280
- self.text = ""
281
- self._start_registered_elements(tag, attributes)
282
-
283
- @override
284
- def skippedEntity(self, name): # noqa: N802
285
- """Receive notification of a skipped entity.
286
-
287
- HTML entities will be skipped by the parser. This method will unescape them
288
- and add them to the text.
289
-
290
- Args:
291
- name: Entity name.
292
- """
293
- if self.property:
294
- elm_val = self.property[-1]
295
- element = self.Element(elm_val)
296
- if element.is_text:
297
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
298
- unescaped = html.unescape(escaped)
299
- if unescaped == escaped:
300
- _log.debug(f"Unrecognized HTML entity: {name}")
301
- return
302
-
303
- if element in (
304
- self.Element.STYLE_SUPERSCRIPT,
305
- self.Element.STYLE_SUBSCRIPT,
306
- ):
307
- # superscripts and subscripts need to be under text elements
308
- if len(self.property) < 2:
309
- return
310
- parent_val = self.property[-2]
311
- parent = self.Element(parent_val)
312
- if parent.is_text:
313
- self.text += self._apply_style(unescaped, elm_val)
314
- else:
315
- self.text += unescaped
316
-
317
- @override
318
- def endElement(self, tag): # noqa: N802
319
- """Signal the end of an element.
320
-
321
- Args:
322
- tag: The element tag.
323
- """
324
- if tag in (
325
- self.APP_DOC_ELEMENT,
326
- self.GRANT_DOC_ELEMENT,
327
- ):
328
- self._clean_data()
329
- self._end_registered_element(tag)
330
-
331
- @override
332
- def characters(self, content):
333
- """Receive notification of character data.
334
-
335
- Args:
336
- content: Data reported by the handler.
337
- """
338
- if self.property:
339
- elm_val = self.property[-1]
340
- element = self.Element(elm_val)
341
- if element.is_text:
342
- if element in (
343
- self.Element.STYLE_SUPERSCRIPT,
344
- self.Element.STYLE_SUBSCRIPT,
345
- ):
346
- # superscripts and subscripts need to be under text elements
347
- if len(self.property) < 2:
348
- return
349
- parent_val = self.property[-2]
350
- parent = self.Element(parent_val)
351
- if parent.is_text:
352
- self.text += self._apply_style(content, elm_val)
353
- else:
354
- self.text += content
355
-
356
- def _start_registered_elements(
357
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
358
- ) -> None:
359
- if tag in [member.value for member in self.Element]:
360
- # special case for claims: claim lines may start before the
361
- # previous one is closed
362
- if (
363
- tag == self.Element.CLAIM_TEXT.value
364
- and self.property
365
- and self.property[-1] == tag
366
- and self.text.strip()
367
- ):
368
- self.claim += " " + self.text.strip()
369
- self.text = ""
370
- elif tag == self.Element.HEADING.value:
371
- level_attr: str = attributes.get("level", "")
372
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
373
- max_level = min(self.parents.keys())
374
- # increase heading level with 1 for title, if any
375
- self.level = (
376
- new_level + 1 if (new_level + 1) in self.parents else max_level
377
- )
378
- self.property.append(tag)
379
-
380
- def _end_registered_element(self, tag: str) -> None:
381
- if tag in [item.value for item in self.Element] and self.property:
382
- current_tag = self.property.pop()
383
- self._add_property(current_tag, self.text.strip())
384
-
385
- def _add_property(self, name: str, text: str) -> None:
386
- if not name or not self.doc:
387
- return
388
-
389
- if name == self.Element.TITLE.value:
390
- if text:
391
- self.parents[self.level + 1] = self.doc.add_title(
392
- parent=self.parents[self.level],
393
- text=text,
394
- )
395
- self.level += 1
396
- self.text = ""
397
-
398
- elif name == self.Element.ABSTRACT.value:
399
- if self.abstract:
400
- heading_text = PatentHeading.ABSTRACT.value
401
- heading_level = (
402
- PatentHeading.ABSTRACT.level
403
- if PatentHeading.ABSTRACT.level in self.parents
404
- else 1
405
- )
406
- abstract_item = self.doc.add_heading(
407
- heading_text,
408
- level=heading_level,
409
- parent=self.parents[heading_level],
410
- )
411
- self.doc.add_text(
412
- label=DocItemLabel.PARAGRAPH,
413
- text=self.abstract,
414
- parent=abstract_item,
415
- )
416
-
417
- elif name == self.Element.CLAIM_TEXT.value:
418
- text = re.sub("\\s+", " ", text).strip()
419
- if text:
420
- self.claim += " " + text
421
- self.text = ""
422
-
423
- elif name == self.Element.CLAIM.value and self.claim:
424
- self.claims.append(self.claim.strip())
425
- self.claim = ""
426
-
427
- elif name == self.Element.CLAIMS.value and self.claims:
428
- heading_text = PatentHeading.CLAIMS.value
429
- heading_level = (
430
- PatentHeading.CLAIMS.level
431
- if PatentHeading.CLAIMS.level in self.parents
432
- else 1
433
- )
434
- claims_item = self.doc.add_heading(
435
- heading_text,
436
- level=heading_level,
437
- parent=self.parents[heading_level],
438
- )
439
- for text in self.claims:
440
- self.doc.add_text(
441
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
442
- )
443
-
444
- elif name == self.Element.PARAGRAPH.value and text:
445
- # remmove blank spaces added in paragraphs
446
- text = re.sub("\\s+", " ", text)
447
- if self.Element.ABSTRACT.value in self.property:
448
- self.abstract = (
449
- (self.abstract + " " + text) if self.abstract else text
450
- )
451
- else:
452
- self.doc.add_text(
453
- label=DocItemLabel.PARAGRAPH,
454
- text=text,
455
- parent=self.parents[self.level],
456
- )
457
- self.text = ""
458
-
459
- elif name == self.Element.HEADING.value and text:
460
- self.parents[self.level + 1] = self.doc.add_heading(
461
- text=text,
462
- level=self.level,
463
- parent=self.parents[self.level],
464
- )
465
- self.level += 1
466
- self.text = ""
467
-
468
- elif name == self.Element.TABLE.value:
469
- # set an empty table as placeholder
470
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
471
- self.doc.add_table(
472
- data=empty_table,
473
- parent=self.parents[self.level],
474
- )
475
-
476
- def _apply_style(self, text: str, style_tag: str) -> str:
477
- """Apply an HTML style to text.
478
-
479
- Args:
480
- text: A string containing plain text.
481
- style_tag: An HTML tag name for styling text. If the tag name is not
482
- recognized as one of the supported styles, the method will return
483
- the original `text`.
484
-
485
- Returns:
486
- A string after applying the style.
487
- """
488
- formatted = text
489
-
490
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
491
- formatted = html.unescape(self.style_html.get_superscript(text))
492
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
493
- formatted = html.unescape(self.style_html.get_subscript(text))
494
-
495
- return formatted
496
-
497
- def _clean_data(self) -> None:
498
- """Reset the variables from stream data."""
499
- self.property = []
500
- self.claim = ""
501
- self.claims = []
502
- self.abstract = ""
503
-
504
-
505
- class PatentUsptoGrantV2(PatentUspto):
506
- """Parser of patent documents from the US Patent Office (grants v2.5).
507
-
508
- The compatible format is:
509
- - Patent Grant Full Text Data/XML Version 2.5 (from January 2002 till December 2004)
510
- """
511
-
512
- @override
513
- def __init__(self) -> None:
514
- """Build an instance of PatentUsptoGrantV2 class."""
515
- self.handler = PatentUsptoGrantV2.PatentHandler()
516
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
517
-
518
- @override
519
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
520
- try:
521
- xml.sax.parseString(patent_content, self.handler)
522
- except xml.sax._exceptions.SAXParseException as exc_sax:
523
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
524
-
525
- return None
526
-
527
- doc = self.handler.doc
528
- if doc:
529
- raw_tables = re.findall(self.pattern, patent_content)
530
- parsed_tables: list[TableData] = []
531
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
532
- for table in raw_tables:
533
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
534
- try:
535
- table_data = table_parser.parse()
536
- if table_data:
537
- parsed_tables.append(table_data)
538
- except Exception as exc_table:
539
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
540
- if len(parsed_tables) != len(doc.tables):
541
- _log.error(
542
- f"Number of referenced ({len(doc.tables)}) and parsed "
543
- f"({len(parsed_tables)}) tables differ."
544
- )
545
- else:
546
- for idx, item in enumerate(parsed_tables):
547
- doc.tables[idx].data = item
548
-
549
- return doc
550
-
551
- class PatentHandler(xml.sax.handler.ContentHandler):
552
- """SAX ContentHandler for patent documents."""
553
-
554
- GRANT_DOC_ELEMENT: Final = "PATDOC"
555
- CLAIM_STATEMENT: Final = "What is claimed is:"
556
-
557
- @unique
558
- class Element(Enum):
559
- """Represents an element of interest in the patent application document."""
560
-
561
- PDAT = "PDAT", True # any type of data
562
- ABSTRACT = ("SDOAB", False)
563
- SDOCL = ("SDOCL", False)
564
- TITLE = ("B540", False)
565
- CLAIMS = ("CL", False)
566
- CLAIM = ("CLM", False)
567
- PARAGRAPH = ("PARA", True)
568
- HEADING = ("H", True)
569
- DRAWINGS = ("DRWDESC", False)
570
- STYLE_SUPERSCRIPT = ("SP", False)
571
- STYLE_SUBSCRIPT = ("SB", False)
572
- STYLE_ITALIC = ("ITALIC", False)
573
- CWU = ("CWU", False) # avoid tables, chemicals, formulas
574
- TABLE = ("table", False) # to keep track of table positions
575
-
576
- @override
577
- def __new__(cls, value: str, _) -> Self:
578
- obj = object.__new__(cls)
579
- obj._value_ = value
580
- return obj
581
-
582
- @override
583
- def __init__(self, _, is_text: bool) -> None:
584
- self.is_text: bool = is_text
585
-
586
- @override
587
- def __init__(self) -> None:
588
- """Build an instance of the patent handler."""
589
- # Current patent being parsed
590
- self.doc: Optional[DoclingDocument] = None
591
- # Keep track of docling hierarchy level
592
- self.level: LevelNumber = 1
593
- # Keep track of docling parents by level
594
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
595
- # Content to retain for the current patent
596
- self.property: list[str]
597
- self.claim: str
598
- self.claims: list[str]
599
- self.paragraph: str
600
- self.abstract: str
601
- self._clean_data()
602
- # To handle mathematical styling
603
- self.style_html = HtmlEntity()
604
-
605
- @override
606
- def startElement(self, tag, attributes): # noqa: N802
607
- """Signal the start of an element.
608
-
609
- Args:
610
- tag: The element tag.
611
- attributes: The element attributes.
612
- """
613
- if tag == self.GRANT_DOC_ELEMENT:
614
- self.doc = DoclingDocument(name="file")
615
- self.text = ""
616
- self._start_registered_elements(tag, attributes)
617
-
618
- @override
619
- def skippedEntity(self, name): # noqa: N802
620
- """Receive notification of a skipped entity.
621
-
622
- HTML entities will be skipped by the parser. This method will unescape them
623
- and add them to the text.
624
-
625
- Args:
626
- name: Entity name.
627
- """
628
- if self.property:
629
- elm_val = self.property[-1]
630
- element = self.Element(elm_val)
631
- if element.is_text:
632
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
633
- unescaped = html.unescape(escaped)
634
- if unescaped == escaped:
635
- logging.debug("Unrecognized HTML entity: " + name)
636
- return
637
-
638
- if element in (
639
- self.Element.STYLE_SUPERSCRIPT,
640
- self.Element.STYLE_SUBSCRIPT,
641
- ):
642
- # superscripts and subscripts need to be under text elements
643
- if len(self.property) < 2:
644
- return
645
- parent_val = self.property[-2]
646
- parent = self.Element(parent_val)
647
- if parent.is_text:
648
- self.text += self._apply_style(unescaped, elm_val)
649
- else:
650
- self.text += unescaped
651
-
652
- @override
653
- def endElement(self, tag): # noqa: N802
654
- """Signal the end of an element.
655
-
656
- Args:
657
- tag: The element tag.
658
- """
659
- if tag == self.GRANT_DOC_ELEMENT:
660
- self._clean_data()
661
- self._end_registered_element(tag)
662
-
663
- @override
664
- def characters(self, content):
665
- """Receive notification of character data.
666
-
667
- Args:
668
- content: Data reported by the handler.
669
- """
670
- if self.property:
671
- elm_val = self.property[-1]
672
- element = self.Element(elm_val)
673
- if element.is_text:
674
- if element in (
675
- self.Element.STYLE_SUPERSCRIPT,
676
- self.Element.STYLE_SUBSCRIPT,
677
- ):
678
- # superscripts and subscripts need to be under text elements
679
- if len(self.property) < 2:
680
- return
681
- parent_val = self.property[-2]
682
- parent = self.Element(parent_val)
683
- if parent.is_text:
684
- self.text += self._apply_style(content, elm_val)
685
- else:
686
- self.text += content
687
-
688
- def _start_registered_elements(
689
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
690
- ) -> None:
691
- if tag in [member.value for member in self.Element]:
692
- if (
693
- tag == self.Element.HEADING.value
694
- and not self.Element.SDOCL.value in self.property
695
- ):
696
- level_attr: str = attributes.get("LVL", "")
697
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
698
- max_level = min(self.parents.keys())
699
- # increase heading level with 1 for title, if any
700
- self.level = (
701
- new_level + 1 if (new_level + 1) in self.parents else max_level
702
- )
703
- self.property.append(tag)
704
-
705
- def _end_registered_element(self, tag: str) -> None:
706
- if tag in [elm.value for elm in self.Element] and self.property:
707
- current_tag = self.property.pop()
708
- self._add_property(current_tag, self.text)
709
-
710
- def _add_property(self, name: str, text: str) -> None:
711
- if not name or not self.doc:
712
- return
713
- if name == self.Element.PDAT.value and text:
714
- if not self.property:
715
- self.text = ""
716
- return
717
-
718
- wrapper = self.property[-1]
719
- text = self._apply_style(text, wrapper)
720
-
721
- if self.Element.TITLE.value in self.property and text.strip():
722
- title = text.strip()
723
- self.parents[self.level + 1] = self.doc.add_title(
724
- parent=self.parents[self.level],
725
- text=title,
726
- )
727
- self.level += 1
728
-
729
- elif self.Element.ABSTRACT.value in self.property:
730
- self.abstract += text
731
-
732
- elif self.Element.CLAIM.value in self.property:
733
- self.claim += text
734
-
735
- # Paragraph text not in claims or abstract
736
- elif (
737
- self.Element.PARAGRAPH.value in self.property
738
- and self.Element.CLAIM.value not in self.property
739
- and self.Element.ABSTRACT.value not in self.property
740
- ):
741
- self.paragraph += text
742
-
743
- # headers except claims statement
744
- elif (
745
- self.Element.HEADING.value in self.property
746
- and not self.Element.SDOCL.value in self.property
747
- and text.strip()
748
- ):
749
- self.parents[self.level + 1] = self.doc.add_heading(
750
- text=text.strip(),
751
- level=self.level,
752
- parent=self.parents[self.level],
753
- )
754
- self.level += 1
755
-
756
- self.text = ""
757
-
758
- elif name == self.Element.CLAIM.value and self.claim.strip():
759
- self.claims.append(self.claim.strip())
760
- self.claim = ""
761
-
762
- elif name == self.Element.CLAIMS.value and self.claims:
763
- heading_text = PatentHeading.CLAIMS.value
764
- heading_level = (
765
- PatentHeading.CLAIMS.level
766
- if PatentHeading.CLAIMS.level in self.parents
767
- else 1
768
- )
769
- claims_item = self.doc.add_heading(
770
- heading_text,
771
- level=heading_level,
772
- parent=self.parents[heading_level],
773
- )
774
- for text in self.claims:
775
- self.doc.add_text(
776
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
777
- )
778
-
779
- elif name == self.Element.ABSTRACT.value and self.abstract.strip():
780
- abstract = self.abstract.strip()
781
- heading_text = PatentHeading.ABSTRACT.value
782
- heading_level = (
783
- PatentHeading.ABSTRACT.level
784
- if PatentHeading.ABSTRACT.level in self.parents
785
- else 1
786
- )
787
- abstract_item = self.doc.add_heading(
788
- heading_text,
789
- level=heading_level,
790
- parent=self.parents[heading_level],
791
- )
792
- self.doc.add_text(
793
- label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
794
- )
795
-
796
- elif name == self.Element.PARAGRAPH.value:
797
- paragraph = self.paragraph.strip()
798
- if paragraph and self.Element.CLAIM.value not in self.property:
799
- self.doc.add_text(
800
- label=DocItemLabel.PARAGRAPH,
801
- text=paragraph,
802
- parent=self.parents[self.level],
803
- )
804
- elif self.Element.CLAIM.value in self.property:
805
- # we may need a space after a paragraph in claim text
806
- self.claim += " "
807
- self.paragraph = ""
808
-
809
- elif name == self.Element.TABLE.value:
810
- # set an empty table as placeholder
811
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
812
- self.doc.add_table(
813
- data=empty_table,
814
- parent=self.parents[self.level],
815
- )
816
-
817
- def _apply_style(self, text: str, style_tag: str) -> str:
818
- """Apply an HTML style to text.
819
-
820
- Args:
821
- text: A string containing plain text.
822
- style_tag: An HTML tag name for styling text. If the tag name is not
823
- recognized as one of the supported styles, the method will return
824
- the original `text`.
825
-
826
- Returns:
827
- A string after applying the style.
828
- """
829
- formatted = text
830
-
831
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
832
- formatted = html.unescape(self.style_html.get_superscript(text))
833
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
834
- formatted = html.unescape(self.style_html.get_subscript(text))
835
- elif style_tag == self.Element.STYLE_ITALIC.value:
836
- formatted = html.unescape(self.style_html.get_math_italic(text))
837
-
838
- return formatted
839
-
840
- def _clean_data(self) -> None:
841
- """Reset the variables from stream data."""
842
- self.text = ""
843
- self.property = []
844
- self.claim = ""
845
- self.claims = []
846
- self.paragraph = ""
847
- self.abstract = ""
848
-
849
-
850
- class PatentUsptoGrantAps(PatentUspto):
851
- """Parser of patents documents from the US Patent Office (grants APS).
852
-
853
- The compatible format is:
854
- - Patent Grant Full Text Data/APS (from January 1976 till December 2001)
855
- """
856
-
857
- @unique
858
- class Section(Enum):
859
- """Represent a section in a patent APS document."""
860
-
861
- ABSTRACT = "ABST"
862
- SUMMARY = "BSUM"
863
- DETAILS = "DETD"
864
- CLAIMS = "CLMS"
865
- DRAWINGS = "DRWD"
866
-
867
- @unique
868
- class Field(Enum):
869
- """Represent a field in a patent APS document."""
870
-
871
- DOC_NUMBER = "WKU"
872
- TITLE = "TTL"
873
- PARAGRAPH = "PAR"
874
- PARAGRAPH_1 = "PA1"
875
- PARAGRAPH_2 = "PA2"
876
- PARAGRAPH_3 = "PA3"
877
- TEXT = "PAL"
878
- CAPTION = "PAC"
879
- NUMBER = "NUM"
880
- NAME = "NAM"
881
- IPC = "ICL"
882
- ISSUED = "ISD"
883
- FILED = "APD"
884
- PATENT_NUMBER = "PNO"
885
- APPLICATION_NUMBER = "APN"
886
- APPLICATION_TYPE = "APT"
887
- COUNTRY = "CNT"
888
-
889
- @override
890
- def __init__(self) -> None:
891
- """Build an instance of PatentUsptoGrantAps class."""
892
- self.doc: Optional[DoclingDocument] = None
893
- # Keep track of docling hierarchy level
894
- self.level: LevelNumber = 1
895
- # Keep track of docling parents by level
896
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
897
-
898
- def get_last_text_item(self) -> Optional[TextItem]:
899
- """Get the last text item at the current document level.
900
-
901
- Returns:
902
- The text item or None, if the current level parent has no children."""
903
- if self.doc:
904
- parent = self.parents[self.level]
905
- children = parent.children if parent is not None else []
906
- else:
907
- return None
908
- text_list: list[TextItem] = [
909
- item
910
- for item in self.doc.texts
911
- if isinstance(item, TextItem) and item.get_ref() in children
912
- ]
913
-
914
- if text_list:
915
- return text_list[-1]
916
- else:
917
- return None
918
-
919
- def store_section(self, section: str) -> None:
920
- """Store the section heading in the docling document.
921
-
922
- Only the predefined sections from PatentHeading will be handled.
923
- The other sections are created by the Field.CAPTION field.
924
-
925
- Args:
926
- section: A patent section name."""
927
- heading: PatentHeading
928
- if self.doc is None:
929
- return
930
- elif section == self.Section.ABSTRACT.value:
931
- heading = PatentHeading.ABSTRACT
932
- elif section == self.Section.CLAIMS.value:
933
- heading = PatentHeading.CLAIMS
934
- else:
935
- return None
936
-
937
- self.level = heading.level if heading.level in self.parents else 1
938
- self.parents[self.level + 1] = self.doc.add_heading(
939
- heading.value,
940
- level=self.level,
941
- parent=self.parents[self.level],
942
- )
943
- self.level += 1
944
-
945
- def store_content(self, section: str, field: str, value: str) -> None:
946
- """Store the key value within a document section in the docling document.
947
-
948
- Args:
949
- section: A patent section name.
950
- field: A field name.
951
- value: A field value name.
952
- """
953
- if (
954
- not self.doc
955
- or not field
956
- or field not in [item.value for item in PatentUsptoGrantAps.Field]
957
- ):
958
- return
959
-
960
- if field == self.Field.TITLE.value:
961
- self.parents[self.level + 1] = self.doc.add_title(
962
- parent=self.parents[self.level], text=value
963
- )
964
- self.level += 1
965
-
966
- elif field == self.Field.TEXT.value and section == self.Section.ABSTRACT.value:
967
- abst_item = self.get_last_text_item()
968
- if abst_item:
969
- abst_item.text += " " + value
970
- else:
971
- self.doc.add_text(
972
- label=DocItemLabel.PARAGRAPH,
973
- text=value,
974
- parent=self.parents[self.level],
975
- )
976
-
977
- elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
978
- self.doc.add_text(
979
- label=DocItemLabel.PARAGRAPH,
980
- text="",
981
- parent=self.parents[self.level],
982
- )
983
-
984
- elif (
985
- field
986
- in (
987
- self.Field.PARAGRAPH.value,
988
- self.Field.PARAGRAPH_1.value,
989
- self.Field.PARAGRAPH_2.value,
990
- self.Field.PARAGRAPH_3.value,
991
- )
992
- and section == self.Section.CLAIMS.value
993
- ):
994
- last_claim = self.get_last_text_item()
995
- if last_claim is None:
996
- last_claim = self.doc.add_text(
997
- label=DocItemLabel.PARAGRAPH,
998
- text="",
999
- parent=self.parents[self.level],
1000
- )
1001
-
1002
- last_claim.text += f" {value}" if last_claim.text else value
1003
-
1004
- elif field == self.Field.CAPTION.value and section in (
1005
- self.Section.SUMMARY.value,
1006
- self.Section.DETAILS.value,
1007
- self.Section.DRAWINGS.value,
1008
- ):
1009
- # captions are siblings of abstract since no level info is provided
1010
- head_item = PatentHeading.ABSTRACT
1011
- self.level = head_item.level if head_item.level in self.parents else 1
1012
- self.parents[self.level + 1] = self.doc.add_heading(
1013
- value,
1014
- level=self.level,
1015
- parent=self.parents[self.level],
1016
- )
1017
- self.level += 1
1018
-
1019
- elif field in (
1020
- self.Field.PARAGRAPH.value,
1021
- self.Field.PARAGRAPH_1.value,
1022
- self.Field.PARAGRAPH_2.value,
1023
- self.Field.PARAGRAPH_3.value,
1024
- ) and section in (
1025
- self.Section.SUMMARY.value,
1026
- self.Section.DETAILS.value,
1027
- self.Section.DRAWINGS.value,
1028
- ):
1029
- self.doc.add_text(
1030
- label=DocItemLabel.PARAGRAPH,
1031
- text=value,
1032
- parent=self.parents[self.level],
1033
- )
1034
-
1035
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
1036
- self.doc = self.doc = DoclingDocument(name="file")
1037
- section: str = ""
1038
- key: str = ""
1039
- value: str = ""
1040
- line_num = 0
1041
- for line in patent_content.splitlines():
1042
- cols = re.split("\\s{2,}", line, maxsplit=1)
1043
- if key and value and (len(cols) == 1 or (len(cols) == 2 and cols[0])):
1044
- self.store_content(section, key, value)
1045
- key = ""
1046
- value = ""
1047
- if len(cols) == 1: # section title
1048
- section = cols[0]
1049
- self.store_section(section)
1050
- _log.debug(f"Parsing section {section}")
1051
- elif len(cols) == 2: # key value
1052
- if cols[0]: # key present
1053
- key = cols[0]
1054
- value = cols[1]
1055
- elif not re.match(r"^##STR\d+##$", cols[1]): # line continues
1056
- value += " " + cols[1]
1057
- line_num += 1
1058
- if key and value:
1059
- self.store_content(section, key, value)
1060
-
1061
- # TODO: parse tables
1062
- return self.doc
1063
-
1064
-
1065
- class PatentUsptoAppV1(PatentUspto):
1066
- """Parser of patent documents from the US Patent Office (applications v1.x)
1067
-
1068
- The compatible format is:
1069
- - Patent Application Full Text Data/XML Version 1.x (from March 2001 till December
1070
- 2004)
1071
- """
1072
-
1073
- @override
1074
- def __init__(self) -> None:
1075
- """Build an instance of PatentUsptoAppV1 class."""
1076
- self.handler = PatentUsptoAppV1.PatentHandler()
1077
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
1078
-
1079
- @override
1080
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
1081
- try:
1082
- xml.sax.parseString(patent_content, self.handler)
1083
- except xml.sax._exceptions.SAXParseException as exc_sax:
1084
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
1085
-
1086
- return None
1087
-
1088
- doc = self.handler.doc
1089
- if doc:
1090
- raw_tables = re.findall(self.pattern, patent_content)
1091
- parsed_tables: list[TableData] = []
1092
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
1093
- for table in raw_tables:
1094
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
1095
- try:
1096
- table_data = table_parser.parse()
1097
- if table_data:
1098
- parsed_tables.append(table_data)
1099
- except Exception as exc_table:
1100
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
1101
- if len(parsed_tables) != len(doc.tables):
1102
- _log.error(
1103
- f"Number of referenced ({len(doc.tables)}) and parsed "
1104
- f"({len(parsed_tables)}) tables differ."
1105
- )
1106
- else:
1107
- for idx, item in enumerate(parsed_tables):
1108
- doc.tables[idx].data = item
1109
-
1110
- return doc
1111
-
1112
- class PatentHandler(xml.sax.handler.ContentHandler):
1113
- """SAX ContentHandler for patent documents."""
1114
-
1115
- APP_DOC_ELEMENT: Final = "patent-application-publication"
1116
-
1117
- @unique
1118
- class Element(Enum):
1119
- """Represents an element of interest in the patent application document."""
1120
-
1121
- DRAWINGS = "brief-description-of-drawings", False
1122
- ABSTRACT = "subdoc-abstract", False
1123
- TITLE = "title-of-invention", True
1124
- CLAIMS = "subdoc-claims", False
1125
- CLAIM = "claim", False
1126
- CLAIM_TEXT = "claim-text", True
1127
- NUMBER = ("number", False)
1128
- PARAGRAPH = "paragraph", True
1129
- HEADING = "heading", True
1130
- STYLE_SUPERSCRIPT = "superscript", True
1131
- STYLE_SUBSCRIPT = "subscript", True
1132
- # do not store text of a table, since it can be within paragraph
1133
- TABLE = "table", False
1134
- # do not store text of a formula, since it can be within paragraph
1135
- MATH = "math-cwu", False
1136
-
1137
- @override
1138
- def __new__(cls, value: str, _) -> Self:
1139
- obj = object.__new__(cls)
1140
- obj._value_ = value
1141
- return obj
1142
-
1143
- @override
1144
- def __init__(self, _, is_text: bool) -> None:
1145
- self.is_text: bool = is_text
1146
-
1147
- @override
1148
- def __init__(self) -> None:
1149
- """Build an instance of the patent handler."""
1150
- # Current patent being parsed
1151
- self.doc: Optional[DoclingDocument] = None
1152
- # Keep track of docling hierarchy level
1153
- self.level: LevelNumber = 1
1154
- # Keep track of docling parents by level
1155
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
1156
- # Content to retain for the current patent
1157
- self.property: list[str]
1158
- self.claim: str
1159
- self.claims: list[str]
1160
- self.abstract: str
1161
- self.text: str
1162
- self._clean_data()
1163
- # To handle mathematical styling
1164
- self.style_html = HtmlEntity()
1165
-
1166
- @override
1167
- def startElement(self, tag, attributes): # noqa: N802
1168
- """Signal the start of an element.
1169
-
1170
- Args:
1171
- tag: The element tag.
1172
- attributes: The element attributes.
1173
- """
1174
- if tag == self.APP_DOC_ELEMENT:
1175
- self.doc = DoclingDocument(name="file")
1176
- self.text = ""
1177
- self._start_registered_elements(tag, attributes)
1178
-
1179
- @override
1180
- def skippedEntity(self, name): # noqa: N802
1181
- """Receive notification of a skipped entity.
1182
-
1183
- HTML entities will be skipped by the parser. This method will unescape them
1184
- and add them to the text.
1185
-
1186
- Args:
1187
- name: Entity name.
1188
- """
1189
- if self.property:
1190
- elm_val = self.property[-1]
1191
- element = self.Element(elm_val)
1192
- if element.is_text:
1193
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
1194
- unescaped = html.unescape(escaped)
1195
- if unescaped == escaped:
1196
- logging.debug("Unrecognized HTML entity: " + name)
1197
- return
1198
-
1199
- if element in (
1200
- self.Element.STYLE_SUPERSCRIPT,
1201
- self.Element.STYLE_SUBSCRIPT,
1202
- ):
1203
- # superscripts and subscripts need to be under text elements
1204
- if len(self.property) < 2:
1205
- return
1206
- parent_val = self.property[-2]
1207
- parent = self.Element(parent_val)
1208
- if parent.is_text:
1209
- self.text += self._apply_style(unescaped, elm_val)
1210
- else:
1211
- self.text += unescaped
1212
-
1213
- @override
1214
- def endElement(self, tag): # noqa: N802
1215
- """Signal the end of an element.
1216
-
1217
- Args:
1218
- tag: The element tag.
1219
- """
1220
- if tag == self.APP_DOC_ELEMENT:
1221
- self._clean_data()
1222
- self._end_registered_element(tag)
1223
-
1224
- @override
1225
- def characters(self, content):
1226
- """Receive notification of character data.
1227
-
1228
- Args:
1229
- content: Data reported by the handler.
1230
- """
1231
- if self.property:
1232
- elm_val = self.property[-1]
1233
- element = self.Element(elm_val)
1234
- if element.is_text:
1235
- if element in (
1236
- self.Element.STYLE_SUPERSCRIPT,
1237
- self.Element.STYLE_SUBSCRIPT,
1238
- ):
1239
- # superscripts and subscripts need to be under text elements
1240
- if len(self.property) < 2:
1241
- return
1242
- parent_val = self.property[-2]
1243
- parent = self.Element(parent_val)
1244
- if parent.is_text:
1245
- self.text += self._apply_style(content, elm_val)
1246
- else:
1247
- self.text += content
1248
-
1249
- def _start_registered_elements(
1250
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
1251
- ) -> None:
1252
- if tag in [member.value for member in self.Element]:
1253
- # special case for claims: claim lines may start before the
1254
- # previous one is closed
1255
- if (
1256
- tag == self.Element.CLAIM_TEXT.value
1257
- and self.property
1258
- and self.property[-1] == tag
1259
- and self.text.strip()
1260
- ):
1261
- self.claim += " " + self.text.strip("\n")
1262
- self.text = ""
1263
- elif tag == self.Element.HEADING.value:
1264
- level_attr: str = attributes.get("lvl", "")
1265
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
1266
- max_level = min(self.parents.keys())
1267
- # increase heading level with 1 for title, if any
1268
- self.level = (
1269
- new_level + 1 if (new_level + 1) in self.parents else max_level
1270
- )
1271
- self.property.append(tag)
1272
-
1273
- def _end_registered_element(self, tag: str) -> None:
1274
- if tag in [elm.value for elm in self.Element] and self.property:
1275
- current_tag = self.property.pop()
1276
- self._add_property(current_tag, self.text)
1277
-
1278
- def _add_property(self, name: str, text: str) -> None:
1279
- if not name or not self.doc:
1280
- return
1281
-
1282
- if name == self.Element.TITLE.value:
1283
- title = text.strip()
1284
- if title:
1285
- self.parents[self.level + 1] = self.doc.add_text(
1286
- parent=self.parents[self.level],
1287
- label=DocItemLabel.TITLE,
1288
- text=title,
1289
- )
1290
- self.level += 1
1291
- self.text = ""
1292
- elif name == self.Element.ABSTRACT.value:
1293
- abstract = self.abstract.strip()
1294
- if abstract:
1295
- heading_text = PatentHeading.ABSTRACT.value
1296
- heading_level = (
1297
- PatentHeading.ABSTRACT.level
1298
- if PatentHeading.ABSTRACT.level in self.parents
1299
- else 1
1300
- )
1301
- abstract_item = self.doc.add_heading(
1302
- heading_text,
1303
- level=heading_level,
1304
- parent=self.parents[heading_level],
1305
- )
1306
- self.doc.add_text(
1307
- label=DocItemLabel.PARAGRAPH,
1308
- text=self.abstract,
1309
- parent=abstract_item,
1310
- )
1311
- self.abstract = ""
1312
- self.text = ""
1313
- elif name == self.Element.CLAIM_TEXT.value:
1314
- if text:
1315
- self.claim += self.text.strip("\n")
1316
- self.text = ""
1317
-
1318
- elif name == self.Element.CLAIM.value:
1319
- claim = self.claim.strip()
1320
- if claim:
1321
- self.claims.append(claim)
1322
- self.claim = ""
1323
-
1324
- elif name == self.Element.CLAIMS.value and self.claims:
1325
- heading_text = PatentHeading.CLAIMS.value
1326
- heading_level = (
1327
- PatentHeading.CLAIMS.level
1328
- if PatentHeading.CLAIMS.level in self.parents
1329
- else 1
1330
- )
1331
- claims_item = self.doc.add_heading(
1332
- heading_text,
1333
- level=heading_level,
1334
- parent=self.parents[heading_level],
1335
- )
1336
- for text in self.claims:
1337
- self.doc.add_text(
1338
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
1339
- )
1340
-
1341
- elif name in (
1342
- self.Element.PARAGRAPH.value,
1343
- self.Element.HEADING.value,
1344
- ):
1345
- if text and self.Element.ABSTRACT.value in self.property:
1346
- self.abstract = (self.abstract + text) if self.abstract else text
1347
- elif text.strip():
1348
- text = re.sub("\\s+", " ", text).strip()
1349
- if name == self.Element.HEADING.value:
1350
- self.parents[self.level + 1] = self.doc.add_heading(
1351
- text=text,
1352
- level=self.level,
1353
- parent=self.parents[self.level],
1354
- )
1355
- self.level += 1
1356
- else:
1357
- self.doc.add_text(
1358
- label=DocItemLabel.PARAGRAPH,
1359
- text=text,
1360
- parent=self.parents[self.level],
1361
- )
1362
- self.text = ""
1363
-
1364
- elif name == self.Element.TABLE.value:
1365
- # set an empty table as placeholder
1366
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1367
- self.doc.add_table(
1368
- data=empty_table,
1369
- parent=self.parents[self.level],
1370
- )
1371
-
1372
- def _apply_style(self, text: str, style_tag: str) -> str:
1373
- """Apply an HTML style to text.
1374
-
1375
- Args:
1376
- text: A string containing plain text.
1377
- style_tag: An HTML tag name for styling text. If the tag name is not
1378
- recognized as one of the supported styles, the method will return
1379
- the original `text`.
1380
-
1381
- Returns:
1382
- A string after applying the style.
1383
- """
1384
- formatted = html.unescape(text)
1385
-
1386
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
1387
- formatted = html.unescape(self.style_html.get_superscript(formatted))
1388
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
1389
- formatted = html.unescape(self.style_html.get_subscript(formatted))
1390
-
1391
- return formatted
1392
-
1393
- def _clean_data(self):
1394
- """Reset the variables from stream data."""
1395
- self.property = []
1396
- self.abstract = ""
1397
- self.claim = ""
1398
- self.claims = []
1399
- self.text = ""
1400
-
1401
-
1402
- class XmlTable:
1403
- """Provide a table parser for xml tables in USPTO patent documents.
1404
-
1405
- The OASIS Open XML Exchange Table Model can be downloaded from:
1406
- http://oasis-open.org/specs/soextblx.dtd
1407
- """
1408
-
1409
- class MinColInfoType(TypedDict):
1410
- offset: list[int]
1411
- colwidth: list[int]
1412
-
1413
- class ColInfoType(MinColInfoType):
1414
- cell_range: list[int]
1415
- cell_offst: list[int]
1416
-
1417
- def __init__(self, input: str) -> None:
1418
- """Initialize the table parser with the xml content.
1419
-
1420
- Args:
1421
- input: The xml content.
1422
- """
1423
- self.max_nbr_messages = 2
1424
- self.nbr_messages = 0
1425
- self.empty_text = ""
1426
- self._soup = BeautifulSoup(input, features="xml")
1427
-
1428
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1429
- """Create a unified range along the table groups.
1430
-
1431
- Args:
1432
- tgs: Table group column specifications.
1433
-
1434
- Returns:
1435
- Unified group column specifications.
1436
- """
1437
- colinfo: dict[int, XmlTable.ColInfoType] = {}
1438
-
1439
- if len(tgs) == 0:
1440
- return colinfo
1441
-
1442
- for itg, tg in enumerate(tgs):
1443
- colinfo[itg] = {
1444
- "offset": [],
1445
- "colwidth": [],
1446
- "cell_range": [],
1447
- "cell_offst": [0],
1448
- }
1449
- offst = 0
1450
- for info in tg["colinfo"]:
1451
- cw = info["colwidth"]
1452
- cw = re.sub("pt", "", cw, flags=re.I)
1453
- cw = re.sub("mm", "", cw, flags=re.I)
1454
- try:
1455
- cw = int(cw)
1456
- except BaseException:
1457
- cw = float(cw)
1458
- colinfo[itg]["colwidth"].append(cw)
1459
- colinfo[itg]["offset"].append(offst)
1460
- offst += cw
1461
- colinfo[itg]["offset"].append(offst)
1462
-
1463
- min_colinfo: XmlTable.MinColInfoType = {"offset": [], "colwidth": []}
1464
-
1465
- min_colinfo["offset"] = colinfo[0]["offset"]
1466
- offset_w0 = []
1467
- for itg, col in colinfo.items():
1468
- # keep track of col with 0 width
1469
- for ic, cw in enumerate(col["colwidth"]):
1470
- if cw == 0:
1471
- offset_w0.append(col["offset"][ic])
1472
-
1473
- min_colinfo["offset"] = sorted(
1474
- list(set(col["offset"] + min_colinfo["offset"]))
1475
- )
1476
-
1477
- # add back the 0 width cols to offset list
1478
- offset_w0 = list(set(offset_w0))
1479
- min_colinfo["offset"] = sorted(min_colinfo["offset"] + offset_w0)
1480
-
1481
- for i in range(len(min_colinfo["offset"]) - 1):
1482
- min_colinfo["colwidth"].append(
1483
- min_colinfo["offset"][i + 1] - min_colinfo["offset"][i]
1484
- )
1485
-
1486
- for itg, col in colinfo.items():
1487
- i = 1
1488
- range_ = 1
1489
- for min_i in range(1, len(min_colinfo["offset"])):
1490
- min_offst = min_colinfo["offset"][min_i]
1491
- offst = col["offset"][i]
1492
- if min_offst == offst:
1493
- if (
1494
- len(col["offset"]) == i + 1
1495
- and len(min_colinfo["offset"]) > min_i + 1
1496
- ):
1497
- range_ += 1
1498
- else:
1499
- col["cell_range"].append(range_)
1500
- col["cell_offst"].append(col["cell_offst"][-1] + range_)
1501
- range_ = 1
1502
- i += 1
1503
- elif min_offst < offst:
1504
- range_ += 1
1505
- else:
1506
- _log.debug("A USPTO XML table has wrong offsets.")
1507
- return {}
1508
-
1509
- return colinfo
1510
-
1511
- def _get_max_ncols(self, tgs_info: dict[int, ColInfoType]) -> NonNegativeInt:
1512
- """Get the maximum number of columns across table groups.
1513
-
1514
- Args:
1515
- tgs_info: Unified group column specifications.
1516
-
1517
- Return:
1518
- The maximum number of columns.
1519
- """
1520
- ncols_max = 0
1521
- for rowinfo in tgs_info.values():
1522
- ncols_max = max(ncols_max, len(rowinfo["colwidth"]))
1523
-
1524
- return ncols_max
1525
-
1526
- def _parse_table(self, table: Tag) -> TableData:
1527
- """Parse the content of a table tag.
1528
-
1529
- Args:
1530
- The table element.
1531
-
1532
- Returns:
1533
- A docling table object.
1534
- """
1535
- tgs_align = []
1536
- tg_secs = table.find_all("tgroup")
1537
- if tg_secs:
1538
- for tg_sec in tg_secs:
1539
- ncols = tg_sec.get("cols", None)
1540
- if ncols:
1541
- ncols = int(ncols)
1542
- tg_align = {"ncols": ncols, "colinfo": []}
1543
- cs_secs = tg_sec.find_all("colspec")
1544
- if cs_secs:
1545
- for cs_sec in cs_secs:
1546
- colname = cs_sec.get("colname", None)
1547
- colwidth = cs_sec.get("colwidth", None)
1548
- tg_align["colinfo"].append(
1549
- {"colname": colname, "colwidth": colwidth}
1550
- )
1551
-
1552
- tgs_align.append(tg_align)
1553
-
1554
- # create unified range along the table groups
1555
- tgs_range = self._create_tg_range(tgs_align)
1556
-
1557
- # if the structure is broken, return an empty table
1558
- if not tgs_range:
1559
- dl_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1560
- return dl_table
1561
-
1562
- ncols_max = self._get_max_ncols(tgs_range)
1563
-
1564
- # extract table data
1565
- table_data: list[TableCell] = []
1566
- i_row_global = 0
1567
- is_row_empty: bool = True
1568
- tg_secs = table.find_all("tgroup")
1569
- if tg_secs:
1570
- for itg, tg_sec in enumerate(tg_secs):
1571
- tg_range = tgs_range[itg]
1572
- row_secs = tg_sec.find_all(["row", "tr"])
1573
-
1574
- if row_secs:
1575
- for row_sec in row_secs:
1576
- entry_secs = row_sec.find_all(["entry", "td"])
1577
- is_header: bool = row_sec.parent.name in ["thead"]
1578
-
1579
- ncols = 0
1580
- local_row: list[TableCell] = []
1581
- is_row_empty = True
1582
- if entry_secs:
1583
- wrong_nbr_cols = False
1584
- for ientry, entry_sec in enumerate(entry_secs):
1585
- text = entry_sec.get_text().strip()
1586
-
1587
- # start-end
1588
- namest = entry_sec.attrs.get("namest", None)
1589
- nameend = entry_sec.attrs.get("nameend", None)
1590
- if isinstance(namest, str) and namest.isnumeric():
1591
- namest = int(namest)
1592
- else:
1593
- namest = ientry + 1
1594
- if isinstance(nameend, str) and nameend.isnumeric():
1595
- nameend = int(nameend)
1596
- shift = 0
1597
- else:
1598
- nameend = ientry + 2
1599
- shift = 1
1600
-
1601
- if nameend > len(tg_range["cell_offst"]):
1602
- wrong_nbr_cols = True
1603
- self.nbr_messages += 1
1604
- if self.nbr_messages <= self.max_nbr_messages:
1605
- _log.debug(
1606
- "USPTO table has # entries != # columns"
1607
- )
1608
- break
1609
-
1610
- range_ = [
1611
- tg_range["cell_offst"][namest - 1],
1612
- tg_range["cell_offst"][nameend - 1] - shift,
1613
- ]
1614
-
1615
- # add row and replicate cell if needed
1616
- cell_text = text if text else self.empty_text
1617
- if cell_text != self.empty_text:
1618
- is_row_empty = False
1619
- for irep in range(range_[0], range_[1] + 1):
1620
- ncols += 1
1621
- local_row.append(
1622
- TableCell(
1623
- column_header=is_header,
1624
- text=cell_text,
1625
- start_row_offset_idx=i_row_global,
1626
- end_row_offset_idx=i_row_global + 1,
1627
- row_span=1,
1628
- start_col_offset_idx=range_[0],
1629
- end_col_offset_idx=range_[1] + 1,
1630
- col_span=range_[1] - range_[0] + 1,
1631
- )
1632
- )
1633
-
1634
- if wrong_nbr_cols:
1635
- # keep empty text, not to introduce noise
1636
- local_row = []
1637
- ncols = 0
1638
-
1639
- # add empty cell up to ncols_max
1640
- for irep in range(ncols, ncols_max):
1641
- local_row.append(
1642
- TableCell(
1643
- column_header=is_header,
1644
- text=self.empty_text,
1645
- start_row_offset_idx=i_row_global,
1646
- end_row_offset_idx=i_row_global + 1,
1647
- row_span=1,
1648
- start_col_offset_idx=irep,
1649
- end_col_offset_idx=irep + 1,
1650
- col_span=1,
1651
- )
1652
- )
1653
- # do not add empty rows
1654
- if not is_row_empty:
1655
- table_data.extend(local_row)
1656
- i_row_global += 1
1657
-
1658
- dl_table = TableData(
1659
- num_rows=i_row_global, num_cols=ncols_max, table_cells=table_data
1660
- )
1661
-
1662
- return dl_table
1663
-
1664
- def parse(self) -> Optional[TableData]:
1665
- """Parse the first table from an xml content.
1666
-
1667
- Returns:
1668
- A docling table data.
1669
- """
1670
- section = self._soup.find("table")
1671
- if section is not None:
1672
- table = self._parse_table(section)
1673
- if table.num_rows == 0 or table.num_cols == 0:
1674
- _log.warning("The parsed USPTO table is empty")
1675
- return table
1676
- else:
1677
- return None
1678
-
1679
-
1680
- class HtmlEntity:
1681
- """Provide utility functions to get the HTML entities of styled characters.
1682
-
1683
- This class has been developped from:
1684
- https://unicode-table.com/en/html-entities/
1685
- https://www.w3.org/TR/WD-math-970515/table03.html
1686
- """
1687
-
1688
- def __init__(self):
1689
- """Initialize this class by loading the HTML entity dictionaries."""
1690
- self.superscript = str.maketrans(
1691
- {
1692
- "1": "&sup1;",
1693
- "2": "&sup2;",
1694
- "3": "&sup3;",
1695
- "4": "&#8308;",
1696
- "5": "&#8309;",
1697
- "6": "&#8310;",
1698
- "7": "&#8311;",
1699
- "8": "&#8312;",
1700
- "9": "&#8313;",
1701
- "0": "&#8304;",
1702
- "+": "&#8314;",
1703
- "-": "&#8315;",
1704
- "−": "&#8315;",
1705
- "=": "&#8316;",
1706
- "(": "&#8317;",
1707
- ")": "&#8318;",
1708
- "a": "&#170;",
1709
- "o": "&#186;",
1710
- "i": "&#8305;",
1711
- "n": "&#8319;",
1712
- }
1713
- )
1714
- self.subscript = str.maketrans(
1715
- {
1716
- "1": "&#8321;",
1717
- "2": "&#8322;",
1718
- "3": "&#8323;",
1719
- "4": "&#8324;",
1720
- "5": "&#8325;",
1721
- "6": "&#8326;",
1722
- "7": "&#8327;",
1723
- "8": "&#8328;",
1724
- "9": "&#8329;",
1725
- "0": "&#8320;",
1726
- "+": "&#8330;",
1727
- "-": "&#8331;",
1728
- "−": "&#8331;",
1729
- "=": "&#8332;",
1730
- "(": "&#8333;",
1731
- ")": "&#8334;",
1732
- "a": "&#8336;",
1733
- "e": "&#8337;",
1734
- "o": "&#8338;",
1735
- "x": "&#8339;",
1736
- }
1737
- )
1738
- self.mathematical_italic = str.maketrans(
1739
- {
1740
- "A": "&#119860;",
1741
- "B": "&#119861;",
1742
- "C": "&#119862;",
1743
- "D": "&#119863;",
1744
- "E": "&#119864;",
1745
- "F": "&#119865;",
1746
- "G": "&#119866;",
1747
- "H": "&#119867;",
1748
- "I": "&#119868;",
1749
- "J": "&#119869;",
1750
- "K": "&#119870;",
1751
- "L": "&#119871;",
1752
- "M": "&#119872;",
1753
- "N": "&#119873;",
1754
- "O": "&#119874;",
1755
- "P": "&#119875;",
1756
- "Q": "&#119876;",
1757
- "R": "&#119877;",
1758
- "S": "&#119878;",
1759
- "T": "&#119879;",
1760
- "U": "&#119880;",
1761
- "V": "&#119881;",
1762
- "W": "&#119882;",
1763
- "Y": "&#119884;",
1764
- "Z": "&#119885;",
1765
- "a": "&#119886;",
1766
- "b": "&#119887;",
1767
- "c": "&#119888;",
1768
- "d": "&#119889;",
1769
- "e": "&#119890;",
1770
- "f": "&#119891;",
1771
- "g": "&#119892;",
1772
- "h": "&#119893;",
1773
- "i": "&#119894;",
1774
- "j": "&#119895;",
1775
- "k": "&#119896;",
1776
- "l": "&#119897;",
1777
- "m": "&#119898;",
1778
- "n": "&#119899;",
1779
- "o": "&#119900;",
1780
- "p": "&#119901;",
1781
- "q": "&#119902;",
1782
- "r": "&#119903;",
1783
- "s": "&#119904;",
1784
- "t": "&#119905;",
1785
- "u": "&#119906;",
1786
- "v": "&#119907;",
1787
- "w": "&#119908;",
1788
- "x": "&#119909;",
1789
- "y": "&#119910;",
1790
- "z": "&#119911;",
1791
- }
1792
- )
1793
-
1794
- self.lookup_iso8879 = {
1795
- "&Agr;": "&Alpha;",
1796
- "&Bgr;": "&Beta;",
1797
- "&Ggr;": "&Gamma;",
1798
- "&Dgr;": "&Delta;",
1799
- "&Egr;": "&Epsilon;",
1800
- "&Zgr;": "&Zeta;",
1801
- "&EEgr;": "&Eta;",
1802
- "&THgr;": "&Theta;",
1803
- "&Igr;": "&Iota;",
1804
- "&Kgr;": "&Kappa;",
1805
- "&Lgr;": "&Lambda;",
1806
- "&Mgr;": "&Mu;",
1807
- "&Ngr;": "&Nu;",
1808
- "&Xgr;": "&Xi;",
1809
- "&Ogr;": "&Omicron;",
1810
- "&Pgr;": "&Pi;",
1811
- "&Rgr;": "&Rho;",
1812
- "&Sgr;": "&Sigma;",
1813
- "&Tgr;": "&Tau;",
1814
- "&Ugr;": "&Upsilon;",
1815
- "&PHgr;": "&Phi;",
1816
- "&KHgr;": "&Chi;",
1817
- "&PSgr;": "&Psi;",
1818
- "&OHgr;": "&Omega;",
1819
- "&agr;": "&alpha;",
1820
- "&bgr;": "&beta;",
1821
- "&ggr;": "&gamma;",
1822
- "&dgr;": "&delta;",
1823
- "&egr;": "&epsilon;",
1824
- "&zgr;": "&zeta;",
1825
- "&eegr;": "&eta;",
1826
- "&thgr;": "&theta;",
1827
- "&igr;": "&iota;",
1828
- "&kgr;": "&kappa;",
1829
- "&lgr;": "&lambda;",
1830
- "&mgr;": "&mu;",
1831
- "&ngr;": "&nu;",
1832
- "&xgr;": "&xi;",
1833
- "&ogr;": "&omicron;",
1834
- "&pgr;": "&pi;",
1835
- "&rgr;": "&rho;",
1836
- "&sgr;": "&sigmaf;",
1837
- "&tgr;": "&tau;",
1838
- "&ugr;": "&upsilon;",
1839
- "&phgr;": "&phi;",
1840
- "&khgr;": "&chi;",
1841
- "&psgr;": "&psi;",
1842
- "&ohgr;": "&omega;",
1843
- }
1844
-
1845
- def get_superscript(self, text: str) -> str:
1846
- """Get a text in superscript as HTML entities.
1847
-
1848
- Args:
1849
- text: The text to transform.
1850
-
1851
- Returns:
1852
- The text in superscript as HTML entities.
1853
- """
1854
- return text.translate(self.superscript)
1855
-
1856
- def get_subscript(self, text: str) -> str:
1857
- """Get a text in subscript as HTML entities.
1858
-
1859
- Args:
1860
- The text to transform.
1861
-
1862
- Returns:
1863
- The text in subscript as HTML entities.
1864
- """
1865
- return text.translate(self.subscript)
1866
-
1867
- def get_math_italic(self, text: str) -> str:
1868
- """Get a text in italic as HTML entities.
1869
-
1870
- Args:
1871
- The text to transform.
1872
-
1873
- Returns:
1874
- The text in italics as HTML entities.
1875
- """
1876
- return text.translate(self.mathematical_italic)
1877
-
1878
- def get_greek_from_iso8879(self, text: str) -> str:
1879
- """Get an HTML entity of a greek letter in ISO 8879.
1880
-
1881
- Args:
1882
- The text to transform, as an ISO 8879 entitiy.
1883
-
1884
- Returns:
1885
- The HTML entity representing a greek letter. If the input text is not
1886
- supported, the original text is returned.
1887
- """
1888
- return self.lookup_iso8879.get(text, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py DELETED
@@ -1,12 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
7
- from docling_core.transforms.chunker.hierarchical_chunker import (
8
- DocChunk,
9
- DocMeta,
10
- HierarchicalChunker,
11
- )
12
- from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py DELETED
@@ -1,456 +0,0 @@
1
- import importlib
2
- import logging
3
- import platform
4
- import re
5
- import sys
6
- import tempfile
7
- import time
8
- import warnings
9
- from pathlib import Path
10
- from typing import Annotated, Dict, Iterable, List, Optional, Type
11
-
12
- import typer
13
- from docling_core.types.doc import ImageRefMode
14
- from docling_core.utils.file import resolve_source_to_path
15
- from pydantic import TypeAdapter
16
-
17
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
19
- from docling.backend.pdf_backend import PdfDocumentBackend
20
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
21
- from docling.datamodel.base_models import (
22
- ConversionStatus,
23
- FormatToExtensions,
24
- InputFormat,
25
- OutputFormat,
26
- )
27
- from docling.datamodel.document import ConversionResult
28
- from docling.datamodel.pipeline_options import (
29
- AcceleratorDevice,
30
- AcceleratorOptions,
31
- EasyOcrOptions,
32
- OcrEngine,
33
- OcrMacOptions,
34
- OcrOptions,
35
- PdfBackend,
36
- PdfPipelineOptions,
37
- RapidOcrOptions,
38
- TableFormerMode,
39
- TesseractCliOcrOptions,
40
- TesseractOcrOptions,
41
- )
42
- from docling.datamodel.settings import settings
43
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
44
-
45
- warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
46
- warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
47
-
48
- _log = logging.getLogger(__name__)
49
- from rich.console import Console
50
-
51
- err_console = Console(stderr=True)
52
-
53
-
54
- app = typer.Typer(
55
- name="Docling",
56
- no_args_is_help=True,
57
- add_completion=False,
58
- pretty_exceptions_enable=False,
59
- )
60
-
61
-
62
- def version_callback(value: bool):
63
- if value:
64
- docling_version = importlib.metadata.version("docling")
65
- docling_core_version = importlib.metadata.version("docling-core")
66
- docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
67
- docling_parse_version = importlib.metadata.version("docling-parse")
68
- platform_str = platform.platform()
69
- py_impl_version = sys.implementation.cache_tag
70
- py_lang_version = platform.python_version()
71
- print(f"Docling version: {docling_version}")
72
- print(f"Docling Core version: {docling_core_version}")
73
- print(f"Docling IBM Models version: {docling_ibm_models_version}")
74
- print(f"Docling Parse version: {docling_parse_version}")
75
- print(f"Python: {py_impl_version} ({py_lang_version})")
76
- print(f"Platform: {platform_str}")
77
- raise typer.Exit()
78
-
79
-
80
- def export_documents(
81
- conv_results: Iterable[ConversionResult],
82
- output_dir: Path,
83
- export_json: bool,
84
- export_html: bool,
85
- export_md: bool,
86
- export_txt: bool,
87
- export_doctags: bool,
88
- image_export_mode: ImageRefMode,
89
- ):
90
-
91
- success_count = 0
92
- failure_count = 0
93
-
94
- for conv_res in conv_results:
95
- if conv_res.status == ConversionStatus.SUCCESS:
96
- success_count += 1
97
- doc_filename = conv_res.input.file.stem
98
-
99
- # Export JSON format:
100
- if export_json:
101
- fname = output_dir / f"{doc_filename}.json"
102
- _log.info(f"writing JSON output to {fname}")
103
- conv_res.document.save_as_json(
104
- filename=fname, image_mode=image_export_mode
105
- )
106
-
107
- # Export HTML format:
108
- if export_html:
109
- fname = output_dir / f"{doc_filename}.html"
110
- _log.info(f"writing HTML output to {fname}")
111
- conv_res.document.save_as_html(
112
- filename=fname, image_mode=image_export_mode
113
- )
114
-
115
- # Export Text format:
116
- if export_txt:
117
- fname = output_dir / f"{doc_filename}.txt"
118
- _log.info(f"writing TXT output to {fname}")
119
- conv_res.document.save_as_markdown(
120
- filename=fname,
121
- strict_text=True,
122
- image_mode=ImageRefMode.PLACEHOLDER,
123
- )
124
-
125
- # Export Markdown format:
126
- if export_md:
127
- fname = output_dir / f"{doc_filename}.md"
128
- _log.info(f"writing Markdown output to {fname}")
129
- conv_res.document.save_as_markdown(
130
- filename=fname, image_mode=image_export_mode
131
- )
132
-
133
- # Export Document Tags format:
134
- if export_doctags:
135
- fname = output_dir / f"{doc_filename}.doctags"
136
- _log.info(f"writing Doc Tags output to {fname}")
137
- conv_res.document.save_as_document_tokens(filename=fname)
138
-
139
- else:
140
- _log.warning(f"Document {conv_res.input.file} failed to convert.")
141
- failure_count += 1
142
-
143
- _log.info(
144
- f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
145
- )
146
-
147
-
148
- def _split_list(raw: Optional[str]) -> Optional[List[str]]:
149
- if raw is None:
150
- return None
151
- return re.split(r"[;,]", raw)
152
-
153
-
154
- @app.command(no_args_is_help=True)
155
- def convert(
156
- input_sources: Annotated[
157
- List[str],
158
- typer.Argument(
159
- ...,
160
- metavar="source",
161
- help="PDF files to convert. Can be local file / directory paths or URL.",
162
- ),
163
- ],
164
- from_formats: List[InputFormat] = typer.Option(
165
- None,
166
- "--from",
167
- help="Specify input formats to convert from. Defaults to all formats.",
168
- ),
169
- to_formats: List[OutputFormat] = typer.Option(
170
- None, "--to", help="Specify output formats. Defaults to Markdown."
171
- ),
172
- headers: str = typer.Option(
173
- None,
174
- "--headers",
175
- help="Specify http request headers used when fetching url input sources in the form of a JSON string",
176
- ),
177
- image_export_mode: Annotated[
178
- ImageRefMode,
179
- typer.Option(
180
- ...,
181
- help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
182
- ),
183
- ] = ImageRefMode.EMBEDDED,
184
- ocr: Annotated[
185
- bool,
186
- typer.Option(
187
- ..., help="If enabled, the bitmap content will be processed using OCR."
188
- ),
189
- ] = True,
190
- force_ocr: Annotated[
191
- bool,
192
- typer.Option(
193
- ...,
194
- help="Replace any existing text with OCR generated text over the full content.",
195
- ),
196
- ] = False,
197
- ocr_engine: Annotated[
198
- OcrEngine, typer.Option(..., help="The OCR engine to use.")
199
- ] = OcrEngine.EASYOCR,
200
- ocr_lang: Annotated[
201
- Optional[str],
202
- typer.Option(
203
- ...,
204
- help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
205
- ),
206
- ] = None,
207
- pdf_backend: Annotated[
208
- PdfBackend, typer.Option(..., help="The PDF backend to use.")
209
- ] = PdfBackend.DLPARSE_V2,
210
- table_mode: Annotated[
211
- TableFormerMode,
212
- typer.Option(..., help="The mode to use in the table structure model."),
213
- ] = TableFormerMode.FAST,
214
- enrich_code: Annotated[
215
- bool,
216
- typer.Option(..., help="Enable the code enrichment model in the pipeline."),
217
- ] = False,
218
- enrich_formula: Annotated[
219
- bool,
220
- typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
- ] = False,
222
- enrich_picture_classes: Annotated[
223
- bool,
224
- typer.Option(
225
- ...,
226
- help="Enable the picture classification enrichment model in the pipeline.",
227
- ),
228
- ] = False,
229
- enrich_picture_description: Annotated[
230
- bool,
231
- typer.Option(..., help="Enable the picture description model in the pipeline."),
232
- ] = False,
233
- artifacts_path: Annotated[
234
- Optional[Path],
235
- typer.Option(..., help="If provided, the location of the model artifacts."),
236
- ] = None,
237
- abort_on_error: Annotated[
238
- bool,
239
- typer.Option(
240
- ...,
241
- "--abort-on-error/--no-abort-on-error",
242
- help="If enabled, the bitmap content will be processed using OCR.",
243
- ),
244
- ] = False,
245
- output: Annotated[
246
- Path, typer.Option(..., help="Output directory where results are saved.")
247
- ] = Path("."),
248
- verbose: Annotated[
249
- int,
250
- typer.Option(
251
- "--verbose",
252
- "-v",
253
- count=True,
254
- help="Set the verbosity level. -v for info logging, -vv for debug logging.",
255
- ),
256
- ] = 0,
257
- debug_visualize_cells: Annotated[
258
- bool,
259
- typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
260
- ] = False,
261
- debug_visualize_ocr: Annotated[
262
- bool,
263
- typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
264
- ] = False,
265
- debug_visualize_layout: Annotated[
266
- bool,
267
- typer.Option(
268
- ..., help="Enable debug output which visualizes the layour clusters"
269
- ),
270
- ] = False,
271
- debug_visualize_tables: Annotated[
272
- bool,
273
- typer.Option(..., help="Enable debug output which visualizes the table cells"),
274
- ] = False,
275
- version: Annotated[
276
- Optional[bool],
277
- typer.Option(
278
- "--version",
279
- callback=version_callback,
280
- is_eager=True,
281
- help="Show version information.",
282
- ),
283
- ] = None,
284
- document_timeout: Annotated[
285
- Optional[float],
286
- typer.Option(
287
- ...,
288
- help="The timeout for processing each document, in seconds.",
289
- ),
290
- ] = None,
291
- num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
292
- device: Annotated[
293
- AcceleratorDevice, typer.Option(..., help="Accelerator device")
294
- ] = AcceleratorDevice.AUTO,
295
- ):
296
- if verbose == 0:
297
- logging.basicConfig(level=logging.WARNING)
298
- elif verbose == 1:
299
- logging.basicConfig(level=logging.INFO)
300
- elif verbose == 2:
301
- logging.basicConfig(level=logging.DEBUG)
302
-
303
- settings.debug.visualize_cells = debug_visualize_cells
304
- settings.debug.visualize_layout = debug_visualize_layout
305
- settings.debug.visualize_tables = debug_visualize_tables
306
- settings.debug.visualize_ocr = debug_visualize_ocr
307
-
308
- if from_formats is None:
309
- from_formats = [e for e in InputFormat]
310
-
311
- parsed_headers: Optional[Dict[str, str]] = None
312
- if headers is not None:
313
- headers_t = TypeAdapter(Dict[str, str])
314
- parsed_headers = headers_t.validate_json(headers)
315
-
316
- with tempfile.TemporaryDirectory() as tempdir:
317
- input_doc_paths: List[Path] = []
318
- for src in input_sources:
319
- try:
320
- # check if we can fetch some remote url
321
- source = resolve_source_to_path(
322
- source=src, headers=parsed_headers, workdir=Path(tempdir)
323
- )
324
- input_doc_paths.append(source)
325
- except FileNotFoundError:
326
- err_console.print(
327
- f"[red]Error: The input file {src} does not exist.[/red]"
328
- )
329
- raise typer.Abort()
330
- except IsADirectoryError:
331
- # if the input matches to a file or a folder
332
- try:
333
- local_path = TypeAdapter(Path).validate_python(src)
334
- if local_path.exists() and local_path.is_dir():
335
- for fmt in from_formats:
336
- for ext in FormatToExtensions[fmt]:
337
- input_doc_paths.extend(
338
- list(local_path.glob(f"**/*.{ext}"))
339
- )
340
- input_doc_paths.extend(
341
- list(local_path.glob(f"**/*.{ext.upper()}"))
342
- )
343
- elif local_path.exists():
344
- input_doc_paths.append(local_path)
345
- else:
346
- err_console.print(
347
- f"[red]Error: The input file {src} does not exist.[/red]"
348
- )
349
- raise typer.Abort()
350
- except Exception as err:
351
- err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
352
- _log.info(err) # will print more details if verbose is activated
353
- raise typer.Abort()
354
-
355
- if to_formats is None:
356
- to_formats = [OutputFormat.MARKDOWN]
357
-
358
- export_json = OutputFormat.JSON in to_formats
359
- export_html = OutputFormat.HTML in to_formats
360
- export_md = OutputFormat.MARKDOWN in to_formats
361
- export_txt = OutputFormat.TEXT in to_formats
362
- export_doctags = OutputFormat.DOCTAGS in to_formats
363
-
364
- if ocr_engine == OcrEngine.EASYOCR:
365
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
366
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
367
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
368
- elif ocr_engine == OcrEngine.TESSERACT:
369
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
370
- elif ocr_engine == OcrEngine.OCRMAC:
371
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
372
- elif ocr_engine == OcrEngine.RAPIDOCR:
373
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
374
- else:
375
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
376
-
377
- ocr_lang_list = _split_list(ocr_lang)
378
- if ocr_lang_list is not None:
379
- ocr_options.lang = ocr_lang_list
380
-
381
- accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
382
- pipeline_options = PdfPipelineOptions(
383
- accelerator_options=accelerator_options,
384
- do_ocr=ocr,
385
- ocr_options=ocr_options,
386
- do_table_structure=True,
387
- do_code_enrichment=enrich_code,
388
- do_formula_enrichment=enrich_formula,
389
- do_picture_description=enrich_picture_description,
390
- do_picture_classification=enrich_picture_classes,
391
- document_timeout=document_timeout,
392
- )
393
- pipeline_options.table_structure_options.do_cell_matching = (
394
- True # do_cell_matching
395
- )
396
- pipeline_options.table_structure_options.mode = table_mode
397
-
398
- if image_export_mode != ImageRefMode.PLACEHOLDER:
399
- pipeline_options.generate_page_images = True
400
- pipeline_options.generate_picture_images = (
401
- True # FIXME: to be deprecated in verson 3
402
- )
403
- pipeline_options.images_scale = 2
404
-
405
- if artifacts_path is not None:
406
- pipeline_options.artifacts_path = artifacts_path
407
-
408
- if pdf_backend == PdfBackend.DLPARSE_V1:
409
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
410
- elif pdf_backend == PdfBackend.DLPARSE_V2:
411
- backend = DoclingParseV2DocumentBackend
412
- elif pdf_backend == PdfBackend.PYPDFIUM2:
413
- backend = PyPdfiumDocumentBackend
414
- else:
415
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
416
-
417
- pdf_format_option = PdfFormatOption(
418
- pipeline_options=pipeline_options,
419
- backend=backend, # pdf_backend
420
- )
421
- format_options: Dict[InputFormat, FormatOption] = {
422
- InputFormat.PDF: pdf_format_option,
423
- InputFormat.IMAGE: pdf_format_option,
424
- }
425
- doc_converter = DocumentConverter(
426
- allowed_formats=from_formats,
427
- format_options=format_options,
428
- )
429
-
430
- start_time = time.time()
431
-
432
- conv_results = doc_converter.convert_all(
433
- input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
434
- )
435
-
436
- output.mkdir(parents=True, exist_ok=True)
437
- export_documents(
438
- conv_results,
439
- output_dir=output,
440
- export_json=export_json,
441
- export_html=export_html,
442
- export_md=export_md,
443
- export_txt=export_txt,
444
- export_doctags=export_doctags,
445
- image_export_mode=image_export_mode,
446
- )
447
-
448
- end_time = time.time() - start_time
449
-
450
- _log.info(f"All documents were converted in {end_time:.2f} seconds.")
451
-
452
-
453
- click_app = typer.main.get_command(app)
454
-
455
- if __name__ == "__main__":
456
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py DELETED
@@ -1,107 +0,0 @@
1
- import logging
2
- import warnings
3
- from enum import Enum
4
- from pathlib import Path
5
- from typing import Annotated, Optional
6
-
7
- import typer
8
- from rich.console import Console
9
- from rich.logging import RichHandler
10
-
11
- from docling.datamodel.settings import settings
12
- from docling.utils.model_downloader import download_models
13
-
14
- warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
15
- warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
16
-
17
- console = Console()
18
- err_console = Console(stderr=True)
19
-
20
-
21
- app = typer.Typer(
22
- name="Docling models helper",
23
- no_args_is_help=True,
24
- add_completion=False,
25
- pretty_exceptions_enable=False,
26
- )
27
-
28
-
29
- class _AvailableModels(str, Enum):
30
- LAYOUT = "layout"
31
- TABLEFORMER = "tableformer"
32
- CODE_FORMULA = "code_formula"
33
- PICTURE_CLASSIFIER = "picture_classifier"
34
- SMOLVLM = "smolvlm"
35
- EASYOCR = "easyocr"
36
-
37
-
38
- @app.command("download")
39
- def download(
40
- output_dir: Annotated[
41
- Path,
42
- typer.Option(
43
- ...,
44
- "-o",
45
- "--output-dir",
46
- help="The directory where all the models are downloaded.",
47
- ),
48
- ] = (settings.cache_dir / "models"),
49
- force: Annotated[
50
- bool, typer.Option(..., help="If true, the download will be forced")
51
- ] = False,
52
- models: Annotated[
53
- Optional[list[_AvailableModels]],
54
- typer.Argument(
55
- help=f"Models to download (default behavior: all will be downloaded)",
56
- ),
57
- ] = None,
58
- quiet: Annotated[
59
- bool,
60
- typer.Option(
61
- ...,
62
- "-q",
63
- "--quiet",
64
- help="No extra output is generated, the CLI prints only the directory with the cached models.",
65
- ),
66
- ] = False,
67
- ):
68
- if not quiet:
69
- FORMAT = "%(message)s"
70
- logging.basicConfig(
71
- level=logging.INFO,
72
- format="[blue]%(message)s[/blue]",
73
- datefmt="[%X]",
74
- handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
75
- )
76
- to_download = models or [m for m in _AvailableModels]
77
- output_dir = download_models(
78
- output_dir=output_dir,
79
- force=force,
80
- progress=(not quiet),
81
- with_layout=_AvailableModels.LAYOUT in to_download,
82
- with_tableformer=_AvailableModels.TABLEFORMER in to_download,
83
- with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
84
- with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
85
- with_smolvlm=_AvailableModels.SMOLVLM in to_download,
86
- with_easyocr=_AvailableModels.EASYOCR in to_download,
87
- )
88
-
89
- if quiet:
90
- typer.echo(output_dir)
91
- else:
92
- typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
93
-
94
- console.print(
95
- "\n",
96
- "Docling can now be configured for running offline using the local artifacts.\n\n",
97
- "Using the CLI:",
98
- f"`docling --artifacts-path={output_dir} FILE`",
99
- "\n",
100
- "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
101
- )
102
-
103
-
104
- click_app = typer.main.get_command(app)
105
-
106
- if __name__ == "__main__":
107
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py DELETED
@@ -1,17 +0,0 @@
1
- import typer
2
-
3
- from docling.cli.models import app as models_app
4
-
5
- app = typer.Typer(
6
- name="Docling helpers",
7
- no_args_is_help=True,
8
- add_completion=False,
9
- pretty_exceptions_enable=False,
10
- )
11
-
12
- app.add_typer(models_app, name="models")
13
-
14
- click_app = typer.main.get_command(app)
15
-
16
- if __name__ == "__main__":
17
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py DELETED
@@ -1,258 +0,0 @@
1
- from enum import Enum
2
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
3
-
4
- from docling_core.types.doc import (
5
- BoundingBox,
6
- DocItemLabel,
7
- NodeItem,
8
- PictureDataType,
9
- Size,
10
- TableCell,
11
- )
12
- from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
13
- DocumentStream,
14
- )
15
- from PIL.Image import Image
16
- from pydantic import BaseModel, ConfigDict
17
-
18
- if TYPE_CHECKING:
19
- from docling.backend.pdf_backend import PdfPageBackend
20
-
21
-
22
- class ConversionStatus(str, Enum):
23
- PENDING = "pending"
24
- STARTED = "started"
25
- FAILURE = "failure"
26
- SUCCESS = "success"
27
- PARTIAL_SUCCESS = "partial_success"
28
- SKIPPED = "skipped"
29
-
30
-
31
- class InputFormat(str, Enum):
32
- """A document format supported by document backend parsers."""
33
-
34
- DOCX = "docx"
35
- PPTX = "pptx"
36
- HTML = "html"
37
- XML_PUBMED = "xml_pubmed"
38
- IMAGE = "image"
39
- PDF = "pdf"
40
- ASCIIDOC = "asciidoc"
41
- MD = "md"
42
- XLSX = "xlsx"
43
- XML_USPTO = "xml_uspto"
44
- JSON_DOCLING = "json_docling"
45
-
46
-
47
- class OutputFormat(str, Enum):
48
- MARKDOWN = "md"
49
- JSON = "json"
50
- HTML = "html"
51
- TEXT = "text"
52
- DOCTAGS = "doctags"
53
-
54
-
55
- FormatToExtensions: Dict[InputFormat, List[str]] = {
56
- InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
57
- InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
58
- InputFormat.PDF: ["pdf"],
59
- InputFormat.MD: ["md"],
60
- InputFormat.HTML: ["html", "htm", "xhtml"],
61
- InputFormat.XML_PUBMED: ["xml", "nxml"],
62
- InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
63
- InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
64
- InputFormat.XLSX: ["xlsx"],
65
- InputFormat.XML_USPTO: ["xml", "txt"],
66
- InputFormat.JSON_DOCLING: ["json"],
67
- }
68
-
69
- FormatToMimeType: Dict[InputFormat, List[str]] = {
70
- InputFormat.DOCX: [
71
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
72
- "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
73
- ],
74
- InputFormat.PPTX: [
75
- "application/vnd.openxmlformats-officedocument.presentationml.template",
76
- "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
77
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
78
- ],
79
- InputFormat.HTML: ["text/html", "application/xhtml+xml"],
80
- InputFormat.XML_PUBMED: ["application/xml"],
81
- InputFormat.IMAGE: [
82
- "image/png",
83
- "image/jpeg",
84
- "image/tiff",
85
- "image/gif",
86
- "image/bmp",
87
- ],
88
- InputFormat.PDF: ["application/pdf"],
89
- InputFormat.ASCIIDOC: ["text/asciidoc"],
90
- InputFormat.MD: ["text/markdown", "text/x-markdown"],
91
- InputFormat.XLSX: [
92
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
93
- ],
94
- InputFormat.XML_USPTO: ["application/xml", "text/plain"],
95
- InputFormat.JSON_DOCLING: ["application/json"],
96
- }
97
-
98
- MimeTypeToFormat: dict[str, list[InputFormat]] = {
99
- mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
100
- for value in FormatToMimeType.values()
101
- for mime in value
102
- }
103
-
104
-
105
- class DocInputType(str, Enum):
106
- PATH = "path"
107
- STREAM = "stream"
108
-
109
-
110
- class DoclingComponentType(str, Enum):
111
- DOCUMENT_BACKEND = "document_backend"
112
- MODEL = "model"
113
- DOC_ASSEMBLER = "doc_assembler"
114
- USER_INPUT = "user_input"
115
-
116
-
117
- class ErrorItem(BaseModel):
118
- component_type: DoclingComponentType
119
- module_name: str
120
- error_message: str
121
-
122
-
123
- class Cell(BaseModel):
124
- id: int
125
- text: str
126
- bbox: BoundingBox
127
-
128
-
129
- class OcrCell(Cell):
130
- confidence: float
131
-
132
-
133
- class Cluster(BaseModel):
134
- id: int
135
- label: DocItemLabel
136
- bbox: BoundingBox
137
- confidence: float = 1.0
138
- cells: List[Cell] = []
139
- children: List["Cluster"] = [] # Add child cluster support
140
-
141
-
142
- class BasePageElement(BaseModel):
143
- label: DocItemLabel
144
- id: int
145
- page_no: int
146
- cluster: Cluster
147
- text: Optional[str] = None
148
-
149
-
150
- class LayoutPrediction(BaseModel):
151
- clusters: List[Cluster] = []
152
-
153
-
154
- class ContainerElement(
155
- BasePageElement
156
- ): # Used for Form and Key-Value-Regions, only for typing.
157
- pass
158
-
159
-
160
- class Table(BasePageElement):
161
- otsl_seq: List[str]
162
- num_rows: int = 0
163
- num_cols: int = 0
164
- table_cells: List[TableCell]
165
-
166
-
167
- class TableStructurePrediction(BaseModel):
168
- table_map: Dict[int, Table] = {}
169
-
170
-
171
- class TextElement(BasePageElement):
172
- text: str
173
-
174
-
175
- class FigureElement(BasePageElement):
176
- annotations: List[PictureDataType] = []
177
- provenance: Optional[str] = None
178
- predicted_class: Optional[str] = None
179
- confidence: Optional[float] = None
180
-
181
-
182
- class FigureClassificationPrediction(BaseModel):
183
- figure_count: int = 0
184
- figure_map: Dict[int, FigureElement] = {}
185
-
186
-
187
- class EquationPrediction(BaseModel):
188
- equation_count: int = 0
189
- equation_map: Dict[int, TextElement] = {}
190
-
191
-
192
- class PagePredictions(BaseModel):
193
- layout: Optional[LayoutPrediction] = None
194
- tablestructure: Optional[TableStructurePrediction] = None
195
- figures_classification: Optional[FigureClassificationPrediction] = None
196
- equations_prediction: Optional[EquationPrediction] = None
197
-
198
-
199
- PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
200
-
201
-
202
- class AssembledUnit(BaseModel):
203
- elements: List[PageElement] = []
204
- body: List[PageElement] = []
205
- headers: List[PageElement] = []
206
-
207
-
208
- class ItemAndImageEnrichmentElement(BaseModel):
209
- model_config = ConfigDict(arbitrary_types_allowed=True)
210
-
211
- item: NodeItem
212
- image: Image
213
-
214
-
215
- class Page(BaseModel):
216
- model_config = ConfigDict(arbitrary_types_allowed=True)
217
-
218
- page_no: int
219
- # page_hash: Optional[str] = None
220
- size: Optional[Size] = None
221
- cells: List[Cell] = []
222
- predictions: PagePredictions = PagePredictions()
223
- assembled: Optional[AssembledUnit] = None
224
-
225
- _backend: Optional["PdfPageBackend"] = (
226
- None # Internal PDF backend. By default it is cleared during assembling.
227
- )
228
- _default_image_scale: float = 1.0 # Default image scale for external usage.
229
- _image_cache: Dict[float, Image] = (
230
- {}
231
- ) # Cache of images in different scales. By default it is cleared during assembling.
232
-
233
- def get_image(
234
- self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
235
- ) -> Optional[Image]:
236
- if self._backend is None:
237
- return self._image_cache.get(scale, None)
238
-
239
- if not scale in self._image_cache:
240
- if cropbox is None:
241
- self._image_cache[scale] = self._backend.get_page_image(scale=scale)
242
- else:
243
- return self._backend.get_page_image(scale=scale, cropbox=cropbox)
244
-
245
- if cropbox is None:
246
- return self._image_cache[scale]
247
- else:
248
- page_im = self._image_cache[scale]
249
- assert self.size is not None
250
- return page_im.crop(
251
- cropbox.to_top_left_origin(page_height=self.size.height)
252
- .scaled(scale=scale)
253
- .as_tuple()
254
- )
255
-
256
- @property
257
- def image(self) -> Optional[Image]:
258
- return self.get_image(scale=self._default_image_scale)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py DELETED
@@ -1,394 +0,0 @@
1
- import logging
2
- import re
3
- from enum import Enum
4
- from io import BytesIO
5
- from pathlib import Path, PurePath
6
- from typing import (
7
- TYPE_CHECKING,
8
- Dict,
9
- Iterable,
10
- List,
11
- Literal,
12
- Optional,
13
- Set,
14
- Type,
15
- Union,
16
- )
17
-
18
- import filetype
19
- from docling_core.types.doc import (
20
- DocItem,
21
- DocItemLabel,
22
- DoclingDocument,
23
- PictureItem,
24
- SectionHeaderItem,
25
- TableItem,
26
- TextItem,
27
- )
28
- from docling_core.types.doc.document import ListItem
29
- from docling_core.types.legacy_doc.base import (
30
- BaseText,
31
- Figure,
32
- GlmTableCell,
33
- PageDimensions,
34
- PageReference,
35
- Prov,
36
- Ref,
37
- )
38
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
39
- from docling_core.types.legacy_doc.base import TableCell
40
- from docling_core.types.legacy_doc.document import (
41
- CCSDocumentDescription as DsDocumentDescription,
42
- )
43
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
44
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
45
- from docling_core.utils.file import resolve_source_to_stream
46
- from docling_core.utils.legacy import docling_document_to_legacy
47
- from pydantic import BaseModel
48
- from typing_extensions import deprecated
49
-
50
- from docling.backend.abstract_backend import (
51
- AbstractDocumentBackend,
52
- PaginatedDocumentBackend,
53
- )
54
- from docling.datamodel.base_models import (
55
- AssembledUnit,
56
- ConversionStatus,
57
- DocumentStream,
58
- ErrorItem,
59
- FormatToExtensions,
60
- FormatToMimeType,
61
- InputFormat,
62
- MimeTypeToFormat,
63
- Page,
64
- )
65
- from docling.datamodel.settings import DocumentLimits
66
- from docling.utils.profiling import ProfilingItem
67
- from docling.utils.utils import create_file_hash, create_hash
68
-
69
- if TYPE_CHECKING:
70
- from docling.document_converter import FormatOption
71
-
72
- _log = logging.getLogger(__name__)
73
-
74
- layout_label_to_ds_type = {
75
- DocItemLabel.TITLE: "title",
76
- DocItemLabel.DOCUMENT_INDEX: "table",
77
- DocItemLabel.SECTION_HEADER: "subtitle-level-1",
78
- DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
79
- DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
80
- DocItemLabel.CAPTION: "caption",
81
- DocItemLabel.PAGE_HEADER: "page-header",
82
- DocItemLabel.PAGE_FOOTER: "page-footer",
83
- DocItemLabel.FOOTNOTE: "footnote",
84
- DocItemLabel.TABLE: "table",
85
- DocItemLabel.FORMULA: "equation",
86
- DocItemLabel.LIST_ITEM: "paragraph",
87
- DocItemLabel.CODE: "paragraph",
88
- DocItemLabel.PICTURE: "figure",
89
- DocItemLabel.TEXT: "paragraph",
90
- DocItemLabel.PARAGRAPH: "paragraph",
91
- DocItemLabel.FORM: DocItemLabel.FORM.value,
92
- DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
93
- }
94
-
95
- _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
96
-
97
-
98
- class InputDocument(BaseModel):
99
- file: PurePath
100
- document_hash: str # = None
101
- valid: bool = True
102
- limits: DocumentLimits = DocumentLimits()
103
- format: InputFormat # = None
104
-
105
- filesize: Optional[int] = None
106
- page_count: int = 0
107
-
108
- _backend: AbstractDocumentBackend # Internal PDF backend used
109
-
110
- def __init__(
111
- self,
112
- path_or_stream: Union[BytesIO, Path],
113
- format: InputFormat,
114
- backend: Type[AbstractDocumentBackend],
115
- filename: Optional[str] = None,
116
- limits: Optional[DocumentLimits] = None,
117
- ):
118
- super().__init__(
119
- file="", document_hash="", format=InputFormat.PDF
120
- ) # initialize with dummy values
121
-
122
- self.limits = limits or DocumentLimits()
123
- self.format = format
124
-
125
- try:
126
- if isinstance(path_or_stream, Path):
127
- self.file = path_or_stream
128
- self.filesize = path_or_stream.stat().st_size
129
- if self.filesize > self.limits.max_file_size:
130
- self.valid = False
131
- else:
132
- self.document_hash = create_file_hash(path_or_stream)
133
- self._init_doc(backend, path_or_stream)
134
-
135
- elif isinstance(path_or_stream, BytesIO):
136
- assert (
137
- filename is not None
138
- ), "Can't construct InputDocument from stream without providing filename arg."
139
- self.file = PurePath(filename)
140
- self.filesize = path_or_stream.getbuffer().nbytes
141
-
142
- if self.filesize > self.limits.max_file_size:
143
- self.valid = False
144
- else:
145
- self.document_hash = create_file_hash(path_or_stream)
146
- self._init_doc(backend, path_or_stream)
147
- else:
148
- raise RuntimeError(
149
- f"Unexpected type path_or_stream: {type(path_or_stream)}"
150
- )
151
-
152
- # For paginated backends, check if the maximum page count is exceeded.
153
- if self.valid and self._backend.is_valid():
154
- if self._backend.supports_pagination() and isinstance(
155
- self._backend, PaginatedDocumentBackend
156
- ):
157
- self.page_count = self._backend.page_count()
158
- if not self.page_count <= self.limits.max_num_pages:
159
- self.valid = False
160
- elif self.page_count < self.limits.page_range[0]:
161
- self.valid = False
162
-
163
- except (FileNotFoundError, OSError) as e:
164
- self.valid = False
165
- _log.exception(
166
- f"File {self.file.name} not found or cannot be opened.", exc_info=e
167
- )
168
- # raise
169
- except RuntimeError as e:
170
- self.valid = False
171
- _log.exception(
172
- f"An unexpected error occurred while opening the document {self.file.name}",
173
- exc_info=e,
174
- )
175
- # raise
176
-
177
- def _init_doc(
178
- self,
179
- backend: Type[AbstractDocumentBackend],
180
- path_or_stream: Union[BytesIO, Path],
181
- ) -> None:
182
- self._backend = backend(self, path_or_stream=path_or_stream)
183
- if not self._backend.is_valid():
184
- self.valid = False
185
-
186
-
187
- class DocumentFormat(str, Enum):
188
- V2 = "v2"
189
- V1 = "v1"
190
-
191
-
192
- class ConversionResult(BaseModel):
193
- input: InputDocument
194
-
195
- status: ConversionStatus = ConversionStatus.PENDING # failure, success
196
- errors: List[ErrorItem] = [] # structure to keep errors
197
-
198
- pages: List[Page] = []
199
- assembled: AssembledUnit = AssembledUnit()
200
- timings: Dict[str, ProfilingItem] = {}
201
-
202
- document: DoclingDocument = _EMPTY_DOCLING_DOC
203
-
204
- @property
205
- @deprecated("Use document instead.")
206
- def legacy_document(self):
207
- return docling_document_to_legacy(self.document)
208
-
209
-
210
- class _DummyBackend(AbstractDocumentBackend):
211
- def __init__(self, *args, **kwargs):
212
- super().__init__(*args, **kwargs)
213
-
214
- def is_valid(self) -> bool:
215
- return False
216
-
217
- @classmethod
218
- def supported_formats(cls) -> Set[InputFormat]:
219
- return set()
220
-
221
- @classmethod
222
- def supports_pagination(cls) -> bool:
223
- return False
224
-
225
- def unload(self):
226
- return super().unload()
227
-
228
-
229
- class _DocumentConversionInput(BaseModel):
230
-
231
- path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
232
- headers: Optional[Dict[str, str]] = None
233
- limits: Optional[DocumentLimits] = DocumentLimits()
234
-
235
- def docs(
236
- self, format_options: Dict[InputFormat, "FormatOption"]
237
- ) -> Iterable[InputDocument]:
238
- for item in self.path_or_stream_iterator:
239
- obj = (
240
- resolve_source_to_stream(item, self.headers)
241
- if isinstance(item, str)
242
- else item
243
- )
244
- format = self._guess_format(obj)
245
- backend: Type[AbstractDocumentBackend]
246
- if format not in format_options.keys():
247
- _log.error(
248
- f"Input document {obj.name} does not match any allowed format."
249
- )
250
- backend = _DummyBackend
251
- else:
252
- backend = format_options[format].backend
253
-
254
- if isinstance(obj, Path):
255
- yield InputDocument(
256
- path_or_stream=obj,
257
- format=format, # type: ignore[arg-type]
258
- filename=obj.name,
259
- limits=self.limits,
260
- backend=backend,
261
- )
262
- elif isinstance(obj, DocumentStream):
263
- yield InputDocument(
264
- path_or_stream=obj.stream,
265
- format=format, # type: ignore[arg-type]
266
- filename=obj.name,
267
- limits=self.limits,
268
- backend=backend,
269
- )
270
- else:
271
- raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
272
-
273
- def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
274
- content = b"" # empty binary blob
275
- formats: list[InputFormat] = []
276
-
277
- if isinstance(obj, Path):
278
- mime = filetype.guess_mime(str(obj))
279
- if mime is None:
280
- ext = obj.suffix[1:]
281
- mime = _DocumentConversionInput._mime_from_extension(ext)
282
- if mime is None: # must guess from
283
- with obj.open("rb") as f:
284
- content = f.read(1024) # Read first 1KB
285
-
286
- elif isinstance(obj, DocumentStream):
287
- content = obj.stream.read(8192)
288
- obj.stream.seek(0)
289
- mime = filetype.guess_mime(content)
290
- if mime is None:
291
- ext = (
292
- obj.name.rsplit(".", 1)[-1]
293
- if ("." in obj.name and not obj.name.startswith("."))
294
- else ""
295
- )
296
- mime = _DocumentConversionInput._mime_from_extension(ext)
297
-
298
- mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
299
- mime = mime or "text/plain"
300
- formats = MimeTypeToFormat.get(mime, [])
301
- if formats:
302
- if len(formats) == 1 and mime not in ("text/plain"):
303
- return formats[0]
304
- else: # ambiguity in formats
305
- return _DocumentConversionInput._guess_from_content(
306
- content, mime, formats
307
- )
308
- else:
309
- return None
310
-
311
- @staticmethod
312
- def _guess_from_content(
313
- content: bytes, mime: str, formats: list[InputFormat]
314
- ) -> Optional[InputFormat]:
315
- """Guess the input format of a document by checking part of its content."""
316
- input_format: Optional[InputFormat] = None
317
- content_str = content.decode("utf-8")
318
-
319
- if mime == "application/xml":
320
- match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
321
- if match_doctype:
322
- xml_doctype = match_doctype.group()
323
- if InputFormat.XML_USPTO in formats and any(
324
- item in xml_doctype
325
- for item in (
326
- "us-patent-application-v4",
327
- "us-patent-grant-v4",
328
- "us-grant-025",
329
- "patent-application-publication",
330
- )
331
- ):
332
- input_format = InputFormat.XML_USPTO
333
-
334
- if (
335
- InputFormat.XML_PUBMED in formats
336
- and "/NLM//DTD JATS" in xml_doctype
337
- ):
338
- input_format = InputFormat.XML_PUBMED
339
-
340
- elif mime == "text/plain":
341
- if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
342
- input_format = InputFormat.XML_USPTO
343
-
344
- return input_format
345
-
346
- @staticmethod
347
- def _mime_from_extension(ext):
348
- mime = None
349
- if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
350
- mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
351
- elif ext in FormatToExtensions[InputFormat.HTML]:
352
- mime = FormatToMimeType[InputFormat.HTML][0]
353
- elif ext in FormatToExtensions[InputFormat.MD]:
354
- mime = FormatToMimeType[InputFormat.MD][0]
355
- elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
356
- mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
357
- elif ext in FormatToExtensions[InputFormat.PDF]:
358
- mime = FormatToMimeType[InputFormat.PDF][0]
359
- return mime
360
-
361
- @staticmethod
362
- def _detect_html_xhtml(
363
- content: bytes,
364
- ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
365
- """Guess the mime type of an XHTML, HTML, or XML file from its content.
366
-
367
- Args:
368
- content: A short piece of a document from its beginning.
369
-
370
- Returns:
371
- The mime type of an XHTML, HTML, or XML file, or None if the content does
372
- not match any of these formats.
373
- """
374
- content_str = content.decode("ascii", errors="ignore").lower()
375
- # Remove XML comments
376
- content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
377
- content_str = content_str.lstrip()
378
-
379
- if re.match(r"<\?xml", content_str):
380
- if "xhtml" in content_str[:1000]:
381
- return "application/xhtml+xml"
382
- else:
383
- return "application/xml"
384
-
385
- if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
386
- return "text/html"
387
-
388
- p = re.compile(
389
- r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
390
- )
391
- if p.search(content_str):
392
- return "application/xml"
393
-
394
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py DELETED
@@ -1,296 +0,0 @@
1
- import logging
2
- import os
3
- from enum import Enum
4
- from pathlib import Path
5
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
6
-
7
- from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
8
- from pydantic_settings import BaseSettings, SettingsConfigDict
9
-
10
- _log = logging.getLogger(__name__)
11
-
12
-
13
- class AcceleratorDevice(str, Enum):
14
- """Devices to run model inference"""
15
-
16
- AUTO = "auto"
17
- CPU = "cpu"
18
- CUDA = "cuda"
19
- MPS = "mps"
20
-
21
-
22
- class AcceleratorOptions(BaseSettings):
23
- model_config = SettingsConfigDict(
24
- env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
25
- )
26
-
27
- num_threads: int = 4
28
- device: AcceleratorDevice = AcceleratorDevice.AUTO
29
-
30
- @model_validator(mode="before")
31
- @classmethod
32
- def check_alternative_envvars(cls, data: Any) -> Any:
33
- r"""
34
- Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
35
- The alternative envvar is used only if it is valid and the regular envvar is not set.
36
-
37
- Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
38
- the same functionality. In case the alias envvar is set and the user tries to override the
39
- parameter in settings initialization, Pydantic treats the parameter provided in __init__()
40
- as an extra input instead of simply overwriting the evvar value for that parameter.
41
- """
42
- if isinstance(data, dict):
43
- input_num_threads = data.get("num_threads")
44
-
45
- # Check if to set the num_threads from the alternative envvar
46
- if input_num_threads is None:
47
- docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
48
- omp_num_threads = os.getenv("OMP_NUM_THREADS")
49
- if docling_num_threads is None and omp_num_threads is not None:
50
- try:
51
- data["num_threads"] = int(omp_num_threads)
52
- except ValueError:
53
- _log.error(
54
- "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
55
- omp_num_threads,
56
- )
57
- return data
58
-
59
-
60
- class TableFormerMode(str, Enum):
61
- """Modes for the TableFormer model."""
62
-
63
- FAST = "fast"
64
- ACCURATE = "accurate"
65
-
66
-
67
- class TableStructureOptions(BaseModel):
68
- """Options for the table structure."""
69
-
70
- do_cell_matching: bool = (
71
- True
72
- # True: Matches predictions back to PDF cells. Can break table output if PDF cells
73
- # are merged across table columns.
74
- # False: Let table structure model define the text cells, ignore PDF cells.
75
- )
76
- mode: TableFormerMode = TableFormerMode.FAST
77
-
78
-
79
- class OcrOptions(BaseModel):
80
- """OCR options."""
81
-
82
- kind: str
83
- lang: List[str]
84
- force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
85
- bitmap_area_threshold: float = (
86
- 0.05 # percentage of the area for a bitmap to processed with OCR
87
- )
88
-
89
-
90
- class RapidOcrOptions(OcrOptions):
91
- """Options for the RapidOCR engine."""
92
-
93
- kind: Literal["rapidocr"] = "rapidocr"
94
-
95
- # English and chinese are the most commly used models and have been tested with RapidOCR.
96
- lang: List[str] = [
97
- "english",
98
- "chinese",
99
- ] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
100
- # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
101
-
102
- # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
103
- text_score: float = 0.5 # same default as rapidocr
104
-
105
- use_det: Optional[bool] = None # same default as rapidocr
106
- use_cls: Optional[bool] = None # same default as rapidocr
107
- use_rec: Optional[bool] = None # same default as rapidocr
108
-
109
- # class Device(Enum):
110
- # CPU = "CPU"
111
- # CUDA = "CUDA"
112
- # DIRECTML = "DIRECTML"
113
- # AUTO = "AUTO"
114
-
115
- # device: Device = Device.AUTO # Default value is AUTO
116
-
117
- print_verbose: bool = False # same default as rapidocr
118
-
119
- det_model_path: Optional[str] = None # same default as rapidocr
120
- cls_model_path: Optional[str] = None # same default as rapidocr
121
- rec_model_path: Optional[str] = None # same default as rapidocr
122
- rec_keys_path: Optional[str] = None # same default as rapidocr
123
-
124
- model_config = ConfigDict(
125
- extra="forbid",
126
- )
127
-
128
-
129
- class EasyOcrOptions(OcrOptions):
130
- """Options for the EasyOCR engine."""
131
-
132
- kind: Literal["easyocr"] = "easyocr"
133
- lang: List[str] = ["fr", "de", "es", "en"]
134
-
135
- use_gpu: Optional[bool] = None
136
-
137
- confidence_threshold: float = 0.5
138
-
139
- model_storage_directory: Optional[str] = None
140
- recog_network: Optional[str] = "standard"
141
- download_enabled: bool = True
142
-
143
- model_config = ConfigDict(
144
- extra="forbid",
145
- protected_namespaces=(),
146
- )
147
-
148
-
149
- class TesseractCliOcrOptions(OcrOptions):
150
- """Options for the TesseractCli engine."""
151
-
152
- kind: Literal["tesseract"] = "tesseract"
153
- lang: List[str] = ["fra", "deu", "spa", "eng"]
154
- tesseract_cmd: str = "tesseract"
155
- path: Optional[str] = None
156
-
157
- model_config = ConfigDict(
158
- extra="forbid",
159
- )
160
-
161
-
162
- class TesseractOcrOptions(OcrOptions):
163
- """Options for the Tesseract engine."""
164
-
165
- kind: Literal["tesserocr"] = "tesserocr"
166
- lang: List[str] = ["fra", "deu", "spa", "eng"]
167
- path: Optional[str] = None
168
-
169
- model_config = ConfigDict(
170
- extra="forbid",
171
- )
172
-
173
-
174
- class OcrMacOptions(OcrOptions):
175
- """Options for the Mac OCR engine."""
176
-
177
- kind: Literal["ocrmac"] = "ocrmac"
178
- lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
179
- recognition: str = "accurate"
180
- framework: str = "vision"
181
-
182
- model_config = ConfigDict(
183
- extra="forbid",
184
- )
185
-
186
-
187
- class PictureDescriptionBaseOptions(BaseModel):
188
- kind: str
189
- batch_size: int = 8
190
- scale: float = 2
191
-
192
- bitmap_area_threshold: float = (
193
- 0.2 # percentage of the area for a bitmap to processed with the models
194
- )
195
-
196
-
197
- class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
198
- kind: Literal["api"] = "api"
199
-
200
- url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
201
- headers: Dict[str, str] = {}
202
- params: Dict[str, Any] = {}
203
- timeout: float = 20
204
-
205
- prompt: str = "Describe this image in a few sentences."
206
- provenance: str = ""
207
-
208
-
209
- class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
210
- kind: Literal["vlm"] = "vlm"
211
-
212
- repo_id: str
213
- prompt: str = "Describe this image in a few sentences."
214
- # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
215
- generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
216
-
217
- @property
218
- def repo_cache_folder(self) -> str:
219
- return self.repo_id.replace("/", "--")
220
-
221
-
222
- smolvlm_picture_description = PictureDescriptionVlmOptions(
223
- repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
224
- )
225
- # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
226
- granite_picture_description = PictureDescriptionVlmOptions(
227
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
228
- prompt="What is shown in this image?",
229
- )
230
-
231
-
232
- # Define an enum for the backend options
233
- class PdfBackend(str, Enum):
234
- """Enum of valid PDF backends."""
235
-
236
- PYPDFIUM2 = "pypdfium2"
237
- DLPARSE_V1 = "dlparse_v1"
238
- DLPARSE_V2 = "dlparse_v2"
239
-
240
-
241
- # Define an enum for the ocr engines
242
- class OcrEngine(str, Enum):
243
- """Enum of valid OCR engines."""
244
-
245
- EASYOCR = "easyocr"
246
- TESSERACT_CLI = "tesseract_cli"
247
- TESSERACT = "tesseract"
248
- OCRMAC = "ocrmac"
249
- RAPIDOCR = "rapidocr"
250
-
251
-
252
- class PipelineOptions(BaseModel):
253
- """Base pipeline options."""
254
-
255
- create_legacy_output: bool = (
256
- True # This default will be set to False on a future version of docling
257
- )
258
- document_timeout: Optional[float] = None
259
- accelerator_options: AcceleratorOptions = AcceleratorOptions()
260
-
261
-
262
- class PdfPipelineOptions(PipelineOptions):
263
- """Options for the PDF pipeline."""
264
-
265
- artifacts_path: Optional[Union[Path, str]] = None
266
- do_table_structure: bool = True # True: perform table structure extraction
267
- do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
268
- do_code_enrichment: bool = False # True: perform code OCR
269
- do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
270
- do_picture_classification: bool = False # True: classify pictures in documents
271
- do_picture_description: bool = False # True: run describe pictures in documents
272
-
273
- table_structure_options: TableStructureOptions = TableStructureOptions()
274
- ocr_options: Union[
275
- EasyOcrOptions,
276
- TesseractCliOcrOptions,
277
- TesseractOcrOptions,
278
- OcrMacOptions,
279
- RapidOcrOptions,
280
- ] = Field(EasyOcrOptions(), discriminator="kind")
281
- picture_description_options: Annotated[
282
- Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
283
- Field(discriminator="kind"),
284
- ] = smolvlm_picture_description
285
-
286
- images_scale: float = 1.0
287
- generate_page_images: bool = False
288
- generate_picture_images: bool = False
289
- generate_table_images: bool = Field(
290
- default=False,
291
- deprecated=(
292
- "Field `generate_table_images` is deprecated. "
293
- "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
294
- "before conversion and then use the `TableItem.get_image` function."
295
- ),
296
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py DELETED
@@ -1,67 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
- from typing import Annotated, Tuple
4
-
5
- from pydantic import BaseModel, PlainValidator
6
- from pydantic_settings import BaseSettings, SettingsConfigDict
7
-
8
-
9
- def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
10
- if v[0] < 1 or v[1] < v[0]:
11
- raise ValueError(
12
- "Invalid page range: start must be ≥ 1 and end must be ≥ start."
13
- )
14
- return v
15
-
16
-
17
- PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
18
-
19
- DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
20
-
21
-
22
- class DocumentLimits(BaseModel):
23
- max_num_pages: int = sys.maxsize
24
- max_file_size: int = sys.maxsize
25
- page_range: PageRange = DEFAULT_PAGE_RANGE
26
-
27
-
28
- class BatchConcurrencySettings(BaseModel):
29
- doc_batch_size: int = 2
30
- doc_batch_concurrency: int = 2
31
- page_batch_size: int = 4
32
- page_batch_concurrency: int = 2
33
- elements_batch_size: int = 16
34
-
35
- # doc_batch_size: int = 1
36
- # doc_batch_concurrency: int = 1
37
- # page_batch_size: int = 1
38
- # page_batch_concurrency: int = 1
39
-
40
- # model_concurrency: int = 2
41
-
42
- # To force models into single core: export OMP_NUM_THREADS=1
43
-
44
-
45
- class DebugSettings(BaseModel):
46
- visualize_cells: bool = False
47
- visualize_ocr: bool = False
48
- visualize_layout: bool = False
49
- visualize_raw_layout: bool = False
50
- visualize_tables: bool = False
51
-
52
- profile_pipeline_timings: bool = False
53
-
54
- # Path used to output debug information.
55
- debug_output_path: str = str(Path.cwd() / "debug")
56
-
57
-
58
- class AppSettings(BaseSettings):
59
- model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
60
-
61
- perf: BatchConcurrencySettings
62
- debug: DebugSettings
63
-
64
- cache_dir: Path = Path.home() / ".cache" / "docling"
65
-
66
-
67
- settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py DELETED
@@ -1,348 +0,0 @@
1
- import logging
2
- import math
3
- import sys
4
- import time
5
- from functools import partial
6
- from pathlib import Path
7
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
8
-
9
- from pydantic import BaseModel, ConfigDict, model_validator, validate_call
10
-
11
- from docling.backend.abstract_backend import AbstractDocumentBackend
12
- from docling.backend.asciidoc_backend import AsciiDocBackend
13
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
14
- from docling.backend.html_backend import HTMLDocumentBackend
15
- from docling.backend.json.docling_json_backend import DoclingJSONBackend
16
- from docling.backend.md_backend import MarkdownDocumentBackend
17
- from docling.backend.msexcel_backend import MsExcelDocumentBackend
18
- from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
19
- from docling.backend.msword_backend import MsWordDocumentBackend
20
- from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
21
- from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
22
- from docling.datamodel.base_models import (
23
- ConversionStatus,
24
- DoclingComponentType,
25
- DocumentStream,
26
- ErrorItem,
27
- InputFormat,
28
- )
29
- from docling.datamodel.document import (
30
- ConversionResult,
31
- InputDocument,
32
- _DocumentConversionInput,
33
- )
34
- from docling.datamodel.pipeline_options import PipelineOptions
35
- from docling.datamodel.settings import (
36
- DEFAULT_PAGE_RANGE,
37
- DocumentLimits,
38
- PageRange,
39
- settings,
40
- )
41
- from docling.exceptions import ConversionError
42
- from docling.pipeline.base_pipeline import BasePipeline
43
- from docling.pipeline.simple_pipeline import SimplePipeline
44
- from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
45
- from docling.utils.utils import chunkify
46
-
47
- _log = logging.getLogger(__name__)
48
-
49
-
50
- class FormatOption(BaseModel):
51
- pipeline_cls: Type[BasePipeline]
52
- pipeline_options: Optional[PipelineOptions] = None
53
- backend: Type[AbstractDocumentBackend]
54
-
55
- model_config = ConfigDict(arbitrary_types_allowed=True)
56
-
57
- @model_validator(mode="after")
58
- def set_optional_field_default(self) -> "FormatOption":
59
- if self.pipeline_options is None:
60
- self.pipeline_options = self.pipeline_cls.get_default_options()
61
- return self
62
-
63
-
64
- class ExcelFormatOption(FormatOption):
65
- pipeline_cls: Type = SimplePipeline
66
- backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
67
-
68
-
69
- class WordFormatOption(FormatOption):
70
- pipeline_cls: Type = SimplePipeline
71
- backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
72
-
73
-
74
- class PowerpointFormatOption(FormatOption):
75
- pipeline_cls: Type = SimplePipeline
76
- backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
77
-
78
-
79
- class MarkdownFormatOption(FormatOption):
80
- pipeline_cls: Type = SimplePipeline
81
- backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
82
-
83
-
84
- class AsciiDocFormatOption(FormatOption):
85
- pipeline_cls: Type = SimplePipeline
86
- backend: Type[AbstractDocumentBackend] = AsciiDocBackend
87
-
88
-
89
- class HTMLFormatOption(FormatOption):
90
- pipeline_cls: Type = SimplePipeline
91
- backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
92
-
93
-
94
- class PatentUsptoFormatOption(FormatOption):
95
- pipeline_cls: Type = SimplePipeline
96
- backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
97
-
98
-
99
- class XMLPubMedFormatOption(FormatOption):
100
- pipeline_cls: Type = SimplePipeline
101
- backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
102
-
103
-
104
- class ImageFormatOption(FormatOption):
105
- pipeline_cls: Type = StandardPdfPipeline
106
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
107
-
108
-
109
- class PdfFormatOption(FormatOption):
110
- pipeline_cls: Type = StandardPdfPipeline
111
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
112
-
113
-
114
- def _get_default_option(format: InputFormat) -> FormatOption:
115
- format_to_default_options = {
116
- InputFormat.XLSX: FormatOption(
117
- pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
118
- ),
119
- InputFormat.DOCX: FormatOption(
120
- pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
121
- ),
122
- InputFormat.PPTX: FormatOption(
123
- pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
124
- ),
125
- InputFormat.MD: FormatOption(
126
- pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
127
- ),
128
- InputFormat.ASCIIDOC: FormatOption(
129
- pipeline_cls=SimplePipeline, backend=AsciiDocBackend
130
- ),
131
- InputFormat.HTML: FormatOption(
132
- pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
133
- ),
134
- InputFormat.XML_USPTO: FormatOption(
135
- pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
136
- ),
137
- InputFormat.XML_PUBMED: FormatOption(
138
- pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
139
- ),
140
- InputFormat.IMAGE: FormatOption(
141
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
142
- ),
143
- InputFormat.PDF: FormatOption(
144
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
145
- ),
146
- InputFormat.JSON_DOCLING: FormatOption(
147
- pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
148
- ),
149
- }
150
- if (options := format_to_default_options.get(format)) is not None:
151
- return options
152
- else:
153
- raise RuntimeError(f"No default options configured for {format}")
154
-
155
-
156
- class DocumentConverter:
157
- _default_download_filename = "file"
158
-
159
- def __init__(
160
- self,
161
- allowed_formats: Optional[List[InputFormat]] = None,
162
- format_options: Optional[Dict[InputFormat, FormatOption]] = None,
163
- ):
164
- self.allowed_formats = (
165
- allowed_formats if allowed_formats is not None else [e for e in InputFormat]
166
- )
167
- self.format_to_options = {
168
- format: (
169
- _get_default_option(format=format)
170
- if (custom_option := (format_options or {}).get(format)) is None
171
- else custom_option
172
- )
173
- for format in self.allowed_formats
174
- }
175
- self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
176
-
177
- def initialize_pipeline(self, format: InputFormat):
178
- """Initialize the conversion pipeline for the selected format."""
179
- pipeline = self._get_pipeline(doc_format=format)
180
- if pipeline is None:
181
- raise ConversionError(
182
- f"No pipeline could be initialized for format {format}"
183
- )
184
-
185
- @validate_call(config=ConfigDict(strict=True))
186
- def convert(
187
- self,
188
- source: Union[Path, str, DocumentStream], # TODO review naming
189
- headers: Optional[Dict[str, str]] = None,
190
- raises_on_error: bool = True,
191
- max_num_pages: int = sys.maxsize,
192
- max_file_size: int = sys.maxsize,
193
- page_range: PageRange = DEFAULT_PAGE_RANGE,
194
- ) -> ConversionResult:
195
- all_res = self.convert_all(
196
- source=[source],
197
- raises_on_error=raises_on_error,
198
- max_num_pages=max_num_pages,
199
- max_file_size=max_file_size,
200
- headers=headers,
201
- page_range=page_range,
202
- )
203
- return next(all_res)
204
-
205
- @validate_call(config=ConfigDict(strict=True))
206
- def convert_all(
207
- self,
208
- source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
209
- headers: Optional[Dict[str, str]] = None,
210
- raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
211
- max_num_pages: int = sys.maxsize,
212
- max_file_size: int = sys.maxsize,
213
- page_range: PageRange = DEFAULT_PAGE_RANGE,
214
- ) -> Iterator[ConversionResult]:
215
- limits = DocumentLimits(
216
- max_num_pages=max_num_pages,
217
- max_file_size=max_file_size,
218
- page_range=page_range,
219
- )
220
- conv_input = _DocumentConversionInput(
221
- path_or_stream_iterator=source, limits=limits, headers=headers
222
- )
223
- conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
224
-
225
- had_result = False
226
- for conv_res in conv_res_iter:
227
- had_result = True
228
- if raises_on_error and conv_res.status not in {
229
- ConversionStatus.SUCCESS,
230
- ConversionStatus.PARTIAL_SUCCESS,
231
- }:
232
- raise ConversionError(
233
- f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
234
- )
235
- else:
236
- yield conv_res
237
-
238
- if not had_result and raises_on_error:
239
- raise ConversionError(
240
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
241
- )
242
-
243
- def _convert(
244
- self, conv_input: _DocumentConversionInput, raises_on_error: bool
245
- ) -> Iterator[ConversionResult]:
246
- start_time = time.monotonic()
247
-
248
- for input_batch in chunkify(
249
- conv_input.docs(self.format_to_options),
250
- settings.perf.doc_batch_size, # pass format_options
251
- ):
252
- _log.info(f"Going to convert document batch...")
253
-
254
- # parallel processing only within input_batch
255
- # with ThreadPoolExecutor(
256
- # max_workers=settings.perf.doc_batch_concurrency
257
- # ) as pool:
258
- # yield from pool.map(self.process_document, input_batch)
259
- # Note: PDF backends are not thread-safe, thread pool usage was disabled.
260
-
261
- for item in map(
262
- partial(self._process_document, raises_on_error=raises_on_error),
263
- input_batch,
264
- ):
265
- elapsed = time.monotonic() - start_time
266
- start_time = time.monotonic()
267
- _log.info(
268
- f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
269
- )
270
- yield item
271
-
272
- def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
273
- fopt = self.format_to_options.get(doc_format)
274
-
275
- if fopt is None:
276
- return None
277
- else:
278
- pipeline_class = fopt.pipeline_cls
279
- pipeline_options = fopt.pipeline_options
280
-
281
- if pipeline_options is None:
282
- return None
283
- # TODO this will ignore if different options have been defined for the same pipeline class.
284
- if (
285
- pipeline_class not in self.initialized_pipelines
286
- or self.initialized_pipelines[pipeline_class].pipeline_options
287
- != pipeline_options
288
- ):
289
- self.initialized_pipelines[pipeline_class] = pipeline_class(
290
- pipeline_options=pipeline_options
291
- )
292
- return self.initialized_pipelines[pipeline_class]
293
-
294
- def _process_document(
295
- self, in_doc: InputDocument, raises_on_error: bool
296
- ) -> ConversionResult:
297
-
298
- valid = (
299
- self.allowed_formats is not None and in_doc.format in self.allowed_formats
300
- )
301
- if valid:
302
- conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
303
- else:
304
- error_message = f"File format not allowed: {in_doc.file}"
305
- if raises_on_error:
306
- raise ConversionError(error_message)
307
- else:
308
- error_item = ErrorItem(
309
- component_type=DoclingComponentType.USER_INPUT,
310
- module_name="",
311
- error_message=error_message,
312
- )
313
- conv_res = ConversionResult(
314
- input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
315
- )
316
-
317
- return conv_res
318
-
319
- def _execute_pipeline(
320
- self, in_doc: InputDocument, raises_on_error: bool
321
- ) -> ConversionResult:
322
- if in_doc.valid:
323
- pipeline = self._get_pipeline(in_doc.format)
324
- if pipeline is not None:
325
- conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
326
- else:
327
- if raises_on_error:
328
- raise ConversionError(
329
- f"No pipeline could be initialized for {in_doc.file}."
330
- )
331
- else:
332
- conv_res = ConversionResult(
333
- input=in_doc,
334
- status=ConversionStatus.FAILURE,
335
- )
336
- else:
337
- if raises_on_error:
338
- raise ConversionError(f"Input document {in_doc.file} is not valid.")
339
-
340
- else:
341
- # invalid doc or not of desired format
342
- conv_res = ConversionResult(
343
- input=in_doc,
344
- status=ConversionStatus.FAILURE,
345
- )
346
- # TODO add error log why it failed.
347
-
348
- return conv_res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py DELETED
@@ -1,6 +0,0 @@
1
- class BaseError(RuntimeError):
2
- pass
3
-
4
-
5
- class ConversionError(BaseError):
6
- pass
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py DELETED
@@ -1,87 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional
3
-
4
- from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
- from typing_extensions import TypeVar
6
-
7
- from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
8
- from docling.datamodel.document import ConversionResult
9
- from docling.datamodel.settings import settings
10
-
11
-
12
- class BasePageModel(ABC):
13
- @abstractmethod
14
- def __call__(
15
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
16
- ) -> Iterable[Page]:
17
- pass
18
-
19
-
20
- EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
21
-
22
-
23
- class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
24
-
25
- elements_batch_size: int = settings.perf.elements_batch_size
26
-
27
- @abstractmethod
28
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
29
- pass
30
-
31
- @abstractmethod
32
- def prepare_element(
33
- self, conv_res: ConversionResult, element: NodeItem
34
- ) -> Optional[EnrichElementT]:
35
- pass
36
-
37
- @abstractmethod
38
- def __call__(
39
- self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
40
- ) -> Iterable[NodeItem]:
41
- pass
42
-
43
-
44
- class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
45
-
46
- def prepare_element(
47
- self, conv_res: ConversionResult, element: NodeItem
48
- ) -> Optional[NodeItem]:
49
- if self.is_processable(doc=conv_res.document, element=element):
50
- return element
51
- return None
52
-
53
-
54
- class BaseItemAndImageEnrichmentModel(
55
- GenericEnrichmentModel[ItemAndImageEnrichmentElement]
56
- ):
57
-
58
- images_scale: float
59
- expansion_factor: float = 0.0
60
-
61
- def prepare_element(
62
- self, conv_res: ConversionResult, element: NodeItem
63
- ) -> Optional[ItemAndImageEnrichmentElement]:
64
- if not self.is_processable(doc=conv_res.document, element=element):
65
- return None
66
-
67
- assert isinstance(element, DocItem)
68
- element_prov = element.prov[0]
69
-
70
- bbox = element_prov.bbox
71
- width = bbox.r - bbox.l
72
- height = bbox.t - bbox.b
73
-
74
- # TODO: move to a utility in the BoundingBox class
75
- expanded_bbox = BoundingBox(
76
- l=bbox.l - width * self.expansion_factor,
77
- t=bbox.t + height * self.expansion_factor,
78
- r=bbox.r + width * self.expansion_factor,
79
- b=bbox.b - height * self.expansion_factor,
80
- coord_origin=bbox.coord_origin,
81
- )
82
-
83
- page_ix = element_prov.page_no - 1
84
- cropped_image = conv_res.pages[page_ix].get_image(
85
- scale=self.images_scale, cropbox=expanded_bbox
86
- )
87
- return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py DELETED
@@ -1,189 +0,0 @@
1
- import copy
2
- import logging
3
- from abc import abstractmethod
4
- from pathlib import Path
5
- from typing import Iterable, List
6
-
7
- import numpy as np
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from PIL import Image, ImageDraw
10
- from rtree import index
11
- from scipy.ndimage import binary_dilation, find_objects, label
12
-
13
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
- from docling.datamodel.document import ConversionResult
15
- from docling.datamodel.pipeline_options import OcrOptions
16
- from docling.datamodel.settings import settings
17
- from docling.models.base_model import BasePageModel
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class BaseOcrModel(BasePageModel):
23
- def __init__(self, enabled: bool, options: OcrOptions):
24
- self.enabled = enabled
25
- self.options = options
26
-
27
- # Computes the optimum amount and coordinates of rectangles to OCR on a given page
28
- def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
29
- BITMAP_COVERAGE_TRESHOLD = 0.75
30
- assert page.size is not None
31
-
32
- def find_ocr_rects(size, bitmap_rects):
33
- image = Image.new(
34
- "1", (round(size.width), round(size.height))
35
- ) # '1' mode is binary
36
-
37
- # Draw all bitmap rects into a binary image
38
- draw = ImageDraw.Draw(image)
39
- for rect in bitmap_rects:
40
- x0, y0, x1, y1 = rect.as_tuple()
41
- x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
42
- draw.rectangle([(x0, y0), (x1, y1)], fill=1)
43
-
44
- np_image = np.array(image)
45
-
46
- # Dilate the image by 10 pixels to merge nearby bitmap rectangles
47
- structure = np.ones(
48
- (20, 20)
49
- ) # Create a 20x20 structure element (10 pixels in all directions)
50
- np_image = binary_dilation(np_image > 0, structure=structure)
51
-
52
- # Find the connected components
53
- labeled_image, num_features = label(
54
- np_image > 0
55
- ) # Label black (0 value) regions
56
-
57
- # Find enclosing bounding boxes for each connected component.
58
- slices = find_objects(labeled_image)
59
- bounding_boxes = [
60
- BoundingBox(
61
- l=slc[1].start,
62
- t=slc[0].start,
63
- r=slc[1].stop - 1,
64
- b=slc[0].stop - 1,
65
- coord_origin=CoordOrigin.TOPLEFT,
66
- )
67
- for slc in slices
68
- ]
69
-
70
- # Compute area fraction on page covered by bitmaps
71
- area_frac = np.sum(np_image > 0) / (size.width * size.height)
72
-
73
- return (area_frac, bounding_boxes) # fraction covered # boxes
74
-
75
- if page._backend is not None:
76
- bitmap_rects = page._backend.get_bitmap_rects()
77
- else:
78
- bitmap_rects = []
79
- coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
80
-
81
- # return full-page rectangle if page is dominantly covered with bitmaps
82
- if self.options.force_full_page_ocr or coverage > max(
83
- BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
84
- ):
85
- return [
86
- BoundingBox(
87
- l=0,
88
- t=0,
89
- r=page.size.width,
90
- b=page.size.height,
91
- coord_origin=CoordOrigin.TOPLEFT,
92
- )
93
- ]
94
- # return individual rectangles if the bitmap coverage is above the threshold
95
- elif coverage > self.options.bitmap_area_threshold:
96
- return ocr_rects
97
- else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
98
- return []
99
-
100
- # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
101
- def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
102
- # Create R-tree index for programmatic cells
103
- p = index.Property()
104
- p.dimension = 2
105
- idx = index.Index(properties=p)
106
- for i, cell in enumerate(programmatic_cells):
107
- idx.insert(i, cell.bbox.as_tuple())
108
-
109
- def is_overlapping_with_existing_cells(ocr_cell):
110
- # Query the R-tree to get overlapping rectangles
111
- possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
112
-
113
- return (
114
- len(possible_matches_index) > 0
115
- ) # this is a weak criterion but it works.
116
-
117
- filtered_ocr_cells = [
118
- rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
119
- ]
120
- return filtered_ocr_cells
121
-
122
- def post_process_cells(self, ocr_cells, programmatic_cells):
123
- r"""
124
- Post-process the ocr and programmatic cells and return the final list of of cells
125
- """
126
- if self.options.force_full_page_ocr:
127
- # If a full page OCR is forced, use only the OCR cells
128
- cells = [
129
- Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
130
- for c_ocr in ocr_cells
131
- ]
132
- return cells
133
-
134
- ## Remove OCR cells which overlap with programmatic cells.
135
- filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
136
- programmatic_cells.extend(filtered_ocr_cells)
137
- return programmatic_cells
138
-
139
- def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
140
- image = copy.deepcopy(page.image)
141
- scale_x = image.width / page.size.width
142
- scale_y = image.height / page.size.height
143
-
144
- draw = ImageDraw.Draw(image, "RGBA")
145
-
146
- # Draw OCR rectangles as yellow filled rect
147
- for rect in ocr_rects:
148
- x0, y0, x1, y1 = rect.as_tuple()
149
- y0 *= scale_x
150
- y1 *= scale_y
151
- x0 *= scale_x
152
- x1 *= scale_x
153
-
154
- shade_color = (255, 255, 0, 40) # transparent yellow
155
- draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
156
-
157
- # Draw OCR and programmatic cells
158
- for tc in page.cells:
159
- x0, y0, x1, y1 = tc.bbox.as_tuple()
160
- y0 *= scale_x
161
- y1 *= scale_y
162
- x0 *= scale_x
163
- x1 *= scale_x
164
-
165
- if y1 <= y0:
166
- y1, y0 = y0, y1
167
-
168
- color = "gray"
169
- if isinstance(tc, OcrCell):
170
- color = "magenta"
171
- draw.rectangle([(x0, y0), (x1, y1)], outline=color)
172
-
173
- if show:
174
- image.show()
175
- else:
176
- out_path: Path = (
177
- Path(settings.debug.debug_output_path)
178
- / f"debug_{conv_res.input.file.stem}"
179
- )
180
- out_path.mkdir(parents=True, exist_ok=True)
181
-
182
- out_file = out_path / f"ocr_page_{page.page_no:05}.png"
183
- image.save(str(out_file), format="png")
184
-
185
- @abstractmethod
186
- def __call__(
187
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
188
- ) -> Iterable[Page]:
189
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py DELETED
@@ -1,251 +0,0 @@
1
- import re
2
- from pathlib import Path
3
- from typing import Iterable, List, Literal, Optional, Tuple, Union
4
-
5
- import numpy as np
6
- from docling_core.types.doc import (
7
- CodeItem,
8
- DocItemLabel,
9
- DoclingDocument,
10
- NodeItem,
11
- TextItem,
12
- )
13
- from docling_core.types.doc.labels import CodeLanguageLabel
14
- from PIL import Image
15
- from pydantic import BaseModel
16
-
17
- from docling.datamodel.base_models import ItemAndImageEnrichmentElement
18
- from docling.datamodel.pipeline_options import AcceleratorOptions
19
- from docling.models.base_model import BaseItemAndImageEnrichmentModel
20
- from docling.utils.accelerator_utils import decide_device
21
-
22
-
23
- class CodeFormulaModelOptions(BaseModel):
24
- """
25
- Configuration options for the CodeFormulaModel.
26
-
27
- Attributes
28
- ----------
29
- kind : str
30
- Type of the model. Fixed value "code_formula".
31
- do_code_enrichment : bool
32
- True if code enrichment is enabled, False otherwise.
33
- do_formula_enrichment : bool
34
- True if formula enrichment is enabled, False otherwise.
35
- """
36
-
37
- kind: Literal["code_formula"] = "code_formula"
38
- do_code_enrichment: bool = True
39
- do_formula_enrichment: bool = True
40
-
41
-
42
- class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
43
- """
44
- Model for processing and enriching documents with code and formula predictions.
45
-
46
- Attributes
47
- ----------
48
- enabled : bool
49
- True if the model is enabled, False otherwise.
50
- options : CodeFormulaModelOptions
51
- Configuration options for the CodeFormulaModel.
52
- code_formula_model : CodeFormulaPredictor
53
- The predictor model for code and formula processing.
54
-
55
- Methods
56
- -------
57
- __init__(self, enabled, artifacts_path, accelerator_options, code_formula_options)
58
- Initializes the CodeFormulaModel with the given configuration options.
59
- is_processable(self, doc, element)
60
- Determines if a given element in a document can be processed by the model.
61
- __call__(self, doc, element_batch)
62
- Processes the given batch of elements and enriches them with predictions.
63
- """
64
-
65
- _model_repo_folder = "ds4sd--CodeFormula"
66
- elements_batch_size = 5
67
- images_scale = 1.66 # = 120 dpi, aligned with training data resolution
68
- expansion_factor = 0.03
69
-
70
- def __init__(
71
- self,
72
- enabled: bool,
73
- artifacts_path: Optional[Path],
74
- options: CodeFormulaModelOptions,
75
- accelerator_options: AcceleratorOptions,
76
- ):
77
- """
78
- Initializes the CodeFormulaModel with the given configuration.
79
-
80
- Parameters
81
- ----------
82
- enabled : bool
83
- True if the model is enabled, False otherwise.
84
- artifacts_path : Path
85
- Path to the directory containing the model artifacts.
86
- options : CodeFormulaModelOptions
87
- Configuration options for the model.
88
- accelerator_options : AcceleratorOptions
89
- Options specifying the device and number of threads for acceleration.
90
- """
91
- self.enabled = enabled
92
- self.options = options
93
-
94
- if self.enabled:
95
- device = decide_device(accelerator_options.device)
96
-
97
- from docling_ibm_models.code_formula_model.code_formula_predictor import (
98
- CodeFormulaPredictor,
99
- )
100
-
101
- if artifacts_path is None:
102
- artifacts_path = self.download_models()
103
- else:
104
- artifacts_path = artifacts_path / self._model_repo_folder
105
-
106
- self.code_formula_model = CodeFormulaPredictor(
107
- artifacts_path=str(artifacts_path),
108
- device=device,
109
- num_threads=accelerator_options.num_threads,
110
- )
111
-
112
- @staticmethod
113
- def download_models(
114
- local_dir: Optional[Path] = None,
115
- force: bool = False,
116
- progress: bool = False,
117
- ) -> Path:
118
- from huggingface_hub import snapshot_download
119
- from huggingface_hub.utils import disable_progress_bars
120
-
121
- if not progress:
122
- disable_progress_bars()
123
- download_path = snapshot_download(
124
- repo_id="ds4sd/CodeFormula",
125
- force_download=force,
126
- local_dir=local_dir,
127
- revision="v1.0.1",
128
- )
129
-
130
- return Path(download_path)
131
-
132
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
133
- """
134
- Determines if a given element in a document can be processed by the model.
135
-
136
- Parameters
137
- ----------
138
- doc : DoclingDocument
139
- The document being processed.
140
- element : NodeItem
141
- The element within the document to check.
142
-
143
- Returns
144
- -------
145
- bool
146
- True if the element can be processed, False otherwise.
147
- """
148
- return self.enabled and (
149
- (isinstance(element, CodeItem) and self.options.do_code_enrichment)
150
- or (
151
- isinstance(element, TextItem)
152
- and element.label == DocItemLabel.FORMULA
153
- and self.options.do_formula_enrichment
154
- )
155
- )
156
-
157
- def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
158
- """Extracts a programming language from the beginning of a string.
159
-
160
- This function checks if the input string starts with a pattern of the form
161
- ``<_some_language_>``. If it does, it extracts the language string and returns
162
- a tuple of (remainder, language). Otherwise, it returns the original string
163
- and `None`.
164
-
165
- Args:
166
- input_string (str): The input string, which may start with ``<_language_>``.
167
-
168
- Returns:
169
- Tuple[str, Optional[str]]:
170
- A tuple where:
171
- - The first element is either:
172
- - The remainder of the string (everything after ``<_language_>``),
173
- if a match is found; or
174
- - The original string, if no match is found.
175
- - The second element is the extracted language if a match is found;
176
- otherwise, `None`.
177
- """
178
- pattern = r"^<_([^>]+)_>\s*(.*)"
179
- match = re.match(pattern, input_string, flags=re.DOTALL)
180
- if match:
181
- language = str(match.group(1)) # the captured programming language
182
- remainder = str(match.group(2)) # everything after the <_language_>
183
- return remainder, language
184
- else:
185
- return input_string, None
186
-
187
- def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
188
- """
189
- Converts a string to a corresponding `CodeLanguageLabel` enum member.
190
-
191
- If the provided string does not match any value in `CodeLanguageLabel`,
192
- it defaults to `CodeLanguageLabel.UNKNOWN`.
193
-
194
- Args:
195
- value (Optional[str]): The string representation of the code language or None.
196
-
197
- Returns:
198
- CodeLanguageLabel: The corresponding enum member if the value is valid,
199
- otherwise `CodeLanguageLabel.UNKNOWN`.
200
- """
201
- if not isinstance(value, str):
202
- return CodeLanguageLabel.UNKNOWN
203
-
204
- try:
205
- return CodeLanguageLabel(value)
206
- except ValueError:
207
- return CodeLanguageLabel.UNKNOWN
208
-
209
- def __call__(
210
- self,
211
- doc: DoclingDocument,
212
- element_batch: Iterable[ItemAndImageEnrichmentElement],
213
- ) -> Iterable[NodeItem]:
214
- """
215
- Processes the given batch of elements and enriches them with predictions.
216
-
217
- Parameters
218
- ----------
219
- doc : DoclingDocument
220
- The document being processed.
221
- element_batch : Iterable[ItemAndImageEnrichmentElement]
222
- A batch of elements to be processed.
223
-
224
- Returns
225
- -------
226
- Iterable[Any]
227
- An iterable of enriched elements.
228
- """
229
- if not self.enabled:
230
- for element in element_batch:
231
- yield element.item
232
- return
233
-
234
- labels: List[str] = []
235
- images: List[Union[Image.Image, np.ndarray]] = []
236
- elements: List[TextItem] = []
237
- for el in element_batch:
238
- assert isinstance(el.item, TextItem)
239
- elements.append(el.item)
240
- labels.append(el.item.label)
241
- images.append(el.image)
242
-
243
- outputs = self.code_formula_model.predict(images, labels)
244
-
245
- for item, output in zip(elements, outputs):
246
- if isinstance(item, CodeItem):
247
- output, code_language = self._extract_code_language(output)
248
- item.code_language = self._get_code_language_enum(code_language)
249
- item.text = output
250
-
251
- yield item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py DELETED
@@ -1,190 +0,0 @@
1
- from pathlib import Path
2
- from typing import Iterable, List, Literal, Optional, Tuple, Union
3
-
4
- import numpy as np
5
- from docling_core.types.doc import (
6
- DoclingDocument,
7
- NodeItem,
8
- PictureClassificationClass,
9
- PictureClassificationData,
10
- PictureItem,
11
- )
12
- from PIL import Image
13
- from pydantic import BaseModel
14
-
15
- from docling.datamodel.pipeline_options import AcceleratorOptions
16
- from docling.models.base_model import BaseEnrichmentModel
17
- from docling.utils.accelerator_utils import decide_device
18
-
19
-
20
- class DocumentPictureClassifierOptions(BaseModel):
21
- """
22
- Options for configuring the DocumentPictureClassifier.
23
-
24
- Attributes
25
- ----------
26
- kind : Literal["document_picture_classifier"]
27
- Identifier for the type of classifier.
28
- """
29
-
30
- kind: Literal["document_picture_classifier"] = "document_picture_classifier"
31
-
32
-
33
- class DocumentPictureClassifier(BaseEnrichmentModel):
34
- """
35
- A model for classifying pictures in documents.
36
-
37
- This class enriches document pictures with predicted classifications
38
- based on a predefined set of classes.
39
-
40
- Attributes
41
- ----------
42
- enabled : bool
43
- Whether the classifier is enabled for use.
44
- options : DocumentPictureClassifierOptions
45
- Configuration options for the classifier.
46
- document_picture_classifier : DocumentPictureClassifierPredictor
47
- The underlying prediction model, loaded if the classifier is enabled.
48
-
49
- Methods
50
- -------
51
- __init__(enabled, artifacts_path, options, accelerator_options)
52
- Initializes the classifier with specified configurations.
53
- is_processable(doc, element)
54
- Checks if the given element can be processed by the classifier.
55
- __call__(doc, element_batch)
56
- Processes a batch of elements and adds classification annotations.
57
- """
58
-
59
- _model_repo_folder = "ds4sd--DocumentFigureClassifier"
60
- images_scale = 2
61
-
62
- def __init__(
63
- self,
64
- enabled: bool,
65
- artifacts_path: Optional[Path],
66
- options: DocumentPictureClassifierOptions,
67
- accelerator_options: AcceleratorOptions,
68
- ):
69
- """
70
- Initializes the DocumentPictureClassifier.
71
-
72
- Parameters
73
- ----------
74
- enabled : bool
75
- Indicates whether the classifier is enabled.
76
- artifacts_path : Optional[Union[Path, str]],
77
- Path to the directory containing model artifacts.
78
- options : DocumentPictureClassifierOptions
79
- Configuration options for the classifier.
80
- accelerator_options : AcceleratorOptions
81
- Options for configuring the device and parallelism.
82
- """
83
- self.enabled = enabled
84
- self.options = options
85
-
86
- if self.enabled:
87
- device = decide_device(accelerator_options.device)
88
- from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
89
- DocumentFigureClassifierPredictor,
90
- )
91
-
92
- if artifacts_path is None:
93
- artifacts_path = self.download_models()
94
- else:
95
- artifacts_path = artifacts_path / self._model_repo_folder
96
-
97
- self.document_picture_classifier = DocumentFigureClassifierPredictor(
98
- artifacts_path=str(artifacts_path),
99
- device=device,
100
- num_threads=accelerator_options.num_threads,
101
- )
102
-
103
- @staticmethod
104
- def download_models(
105
- local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
106
- ) -> Path:
107
- from huggingface_hub import snapshot_download
108
- from huggingface_hub.utils import disable_progress_bars
109
-
110
- if not progress:
111
- disable_progress_bars()
112
- download_path = snapshot_download(
113
- repo_id="ds4sd/DocumentFigureClassifier",
114
- force_download=force,
115
- local_dir=local_dir,
116
- revision="v1.0.0",
117
- )
118
-
119
- return Path(download_path)
120
-
121
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
122
- """
123
- Determines if the given element can be processed by the classifier.
124
-
125
- Parameters
126
- ----------
127
- doc : DoclingDocument
128
- The document containing the element.
129
- element : NodeItem
130
- The element to be checked.
131
-
132
- Returns
133
- -------
134
- bool
135
- True if the element is a PictureItem and processing is enabled; False otherwise.
136
- """
137
- return self.enabled and isinstance(element, PictureItem)
138
-
139
- def __call__(
140
- self,
141
- doc: DoclingDocument,
142
- element_batch: Iterable[NodeItem],
143
- ) -> Iterable[NodeItem]:
144
- """
145
- Processes a batch of elements and enriches them with classification predictions.
146
-
147
- Parameters
148
- ----------
149
- doc : DoclingDocument
150
- The document containing the elements to be processed.
151
- element_batch : Iterable[NodeItem]
152
- A batch of pictures to classify.
153
-
154
- Returns
155
- -------
156
- Iterable[NodeItem]
157
- An iterable of NodeItem objects after processing. The field
158
- 'data.classification' is added containing the classification for each picture.
159
- """
160
- if not self.enabled:
161
- for element in element_batch:
162
- yield element
163
- return
164
-
165
- images: List[Union[Image.Image, np.ndarray]] = []
166
- elements: List[PictureItem] = []
167
- for el in element_batch:
168
- assert isinstance(el, PictureItem)
169
- elements.append(el)
170
- img = el.get_image(doc)
171
- assert img is not None
172
- images.append(img)
173
-
174
- outputs = self.document_picture_classifier.predict(images)
175
-
176
- for element, output in zip(elements, outputs):
177
- element.annotations.append(
178
- PictureClassificationData(
179
- provenance="DocumentPictureClassifier",
180
- predicted_classes=[
181
- PictureClassificationClass(
182
- class_name=pred[0],
183
- confidence=pred[1],
184
- )
185
- for pred in output
186
- ],
187
- )
188
- )
189
-
190
- yield element
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py DELETED
@@ -1,386 +0,0 @@
1
- import copy
2
- import random
3
- from pathlib import Path
4
- from typing import List, Union
5
-
6
- from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import (
8
- BoundingBox,
9
- CoordOrigin,
10
- DocItemLabel,
11
- DoclingDocument,
12
- )
13
- from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.legacy_doc.base import (
15
- Figure,
16
- PageDimensions,
17
- PageReference,
18
- Prov,
19
- Ref,
20
- )
21
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
22
- from docling_core.types.legacy_doc.base import TableCell
23
- from docling_core.types.legacy_doc.document import BaseText
24
- from docling_core.types.legacy_doc.document import (
25
- CCSDocumentDescription as DsDocumentDescription,
26
- )
27
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
28
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
29
- from PIL import ImageDraw
30
- from pydantic import BaseModel, ConfigDict, TypeAdapter
31
-
32
- from docling.datamodel.base_models import (
33
- Cluster,
34
- ContainerElement,
35
- FigureElement,
36
- Table,
37
- TextElement,
38
- )
39
- from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
40
- from docling.datamodel.settings import settings
41
- from docling.utils.glm_utils import to_docling_document
42
- from docling.utils.profiling import ProfilingScope, TimeRecorder
43
- from docling.utils.utils import create_hash
44
-
45
-
46
- class GlmOptions(BaseModel):
47
- model_config = ConfigDict(protected_namespaces=())
48
-
49
- model_names: str = "" # e.g. "language;term;reference"
50
-
51
-
52
- class GlmModel:
53
- def __init__(self, options: GlmOptions):
54
- self.options = options
55
-
56
- self.model = nlp_model(loglevel="error", text_ordering=True)
57
-
58
- def _to_legacy_document(self, conv_res) -> DsDocument:
59
- title = ""
60
- desc: DsDocumentDescription = DsDocumentDescription(logs=[])
61
-
62
- page_hashes = [
63
- PageReference(
64
- hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
65
- page=p.page_no + 1,
66
- model="default",
67
- )
68
- for p in conv_res.pages
69
- ]
70
-
71
- file_info = DsFileInfoObject(
72
- filename=conv_res.input.file.name,
73
- document_hash=conv_res.input.document_hash,
74
- num_pages=conv_res.input.page_count,
75
- page_hashes=page_hashes,
76
- )
77
-
78
- main_text: List[Union[Ref, BaseText]] = []
79
- page_headers: List[Union[Ref, BaseText]] = []
80
- page_footers: List[Union[Ref, BaseText]] = []
81
-
82
- tables: List[DsSchemaTable] = []
83
- figures: List[Figure] = []
84
-
85
- page_no_to_page = {p.page_no: p for p in conv_res.pages}
86
-
87
- for element in conv_res.assembled.body:
88
- # Convert bboxes to lower-left origin.
89
- target_bbox = DsBoundingBox(
90
- element.cluster.bbox.to_bottom_left_origin(
91
- page_no_to_page[element.page_no].size.height
92
- ).as_tuple()
93
- )
94
-
95
- if isinstance(element, TextElement):
96
- main_text.append(
97
- BaseText(
98
- text=element.text,
99
- obj_type=layout_label_to_ds_type.get(element.label),
100
- name=element.label,
101
- prov=[
102
- Prov(
103
- bbox=target_bbox,
104
- page=element.page_no + 1,
105
- span=[0, len(element.text)],
106
- )
107
- ],
108
- )
109
- )
110
- elif isinstance(element, Table):
111
- index = len(tables)
112
- ref_str = f"#/tables/{index}"
113
- main_text.append(
114
- Ref(
115
- name=element.label,
116
- obj_type=layout_label_to_ds_type.get(element.label),
117
- ref=ref_str,
118
- ),
119
- )
120
-
121
- # Initialise empty table data grid (only empty cells)
122
- table_data = [
123
- [
124
- TableCell(
125
- text="",
126
- # bbox=[0,0,0,0],
127
- spans=[[i, j]],
128
- obj_type="body",
129
- )
130
- for j in range(element.num_cols)
131
- ]
132
- for i in range(element.num_rows)
133
- ]
134
-
135
- # Overwrite cells in table data for which there is actual cell content.
136
- for cell in element.table_cells:
137
- for i in range(
138
- min(cell.start_row_offset_idx, element.num_rows),
139
- min(cell.end_row_offset_idx, element.num_rows),
140
- ):
141
- for j in range(
142
- min(cell.start_col_offset_idx, element.num_cols),
143
- min(cell.end_col_offset_idx, element.num_cols),
144
- ):
145
- celltype = "body"
146
- if cell.column_header:
147
- celltype = "col_header"
148
- elif cell.row_header:
149
- celltype = "row_header"
150
- elif cell.row_section:
151
- celltype = "row_section"
152
-
153
- def make_spans(cell):
154
- for rspan in range(
155
- min(cell.start_row_offset_idx, element.num_rows),
156
- min(cell.end_row_offset_idx, element.num_rows),
157
- ):
158
- for cspan in range(
159
- min(
160
- cell.start_col_offset_idx, element.num_cols
161
- ),
162
- min(cell.end_col_offset_idx, element.num_cols),
163
- ):
164
- yield [rspan, cspan]
165
-
166
- spans = list(make_spans(cell))
167
- if cell.bbox is not None:
168
- bbox = cell.bbox.to_bottom_left_origin(
169
- page_no_to_page[element.page_no].size.height
170
- ).as_tuple()
171
- else:
172
- bbox = None
173
-
174
- table_data[i][j] = TableCell(
175
- text=cell.text,
176
- bbox=bbox,
177
- # col=j,
178
- # row=i,
179
- spans=spans,
180
- obj_type=celltype,
181
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
182
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
183
- )
184
-
185
- tables.append(
186
- DsSchemaTable(
187
- num_cols=element.num_cols,
188
- num_rows=element.num_rows,
189
- obj_type=layout_label_to_ds_type.get(element.label),
190
- data=table_data,
191
- prov=[
192
- Prov(
193
- bbox=target_bbox,
194
- page=element.page_no + 1,
195
- span=[0, 0],
196
- )
197
- ],
198
- )
199
- )
200
-
201
- elif isinstance(element, FigureElement):
202
- index = len(figures)
203
- ref_str = f"#/figures/{index}"
204
- main_text.append(
205
- Ref(
206
- name=element.label,
207
- obj_type=layout_label_to_ds_type.get(element.label),
208
- ref=ref_str,
209
- ),
210
- )
211
- figures.append(
212
- Figure(
213
- prov=[
214
- Prov(
215
- bbox=target_bbox,
216
- page=element.page_no + 1,
217
- span=[0, 0],
218
- )
219
- ],
220
- obj_type=layout_label_to_ds_type.get(element.label),
221
- payload={
222
- "children": TypeAdapter(List[Cluster]).dump_python(
223
- element.cluster.children
224
- )
225
- }, # hack to channel child clusters through GLM
226
- )
227
- )
228
- elif isinstance(element, ContainerElement):
229
- main_text.append(
230
- BaseText(
231
- text="",
232
- payload={
233
- "children": TypeAdapter(List[Cluster]).dump_python(
234
- element.cluster.children
235
- )
236
- }, # hack to channel child clusters through GLM
237
- obj_type=layout_label_to_ds_type.get(element.label),
238
- name=element.label,
239
- prov=[
240
- Prov(
241
- bbox=target_bbox,
242
- page=element.page_no + 1,
243
- span=[0, 0],
244
- )
245
- ],
246
- )
247
- )
248
-
249
- # We can throw in headers and footers at the end of the legacy doc
250
- # since the reading-order will re-sort it later.
251
- for element in conv_res.assembled.headers:
252
- # Convert bboxes to lower-left origin.
253
- target_bbox = DsBoundingBox(
254
- element.cluster.bbox.to_bottom_left_origin(
255
- page_no_to_page[element.page_no].size.height
256
- ).as_tuple()
257
- )
258
-
259
- if isinstance(element, TextElement):
260
-
261
- tel = BaseText(
262
- text=element.text,
263
- obj_type=layout_label_to_ds_type.get(element.label),
264
- name=element.label,
265
- prov=[
266
- Prov(
267
- bbox=target_bbox,
268
- page=element.page_no + 1,
269
- span=[0, len(element.text)],
270
- )
271
- ],
272
- )
273
- if element.label == DocItemLabel.PAGE_HEADER:
274
- index = len(page_headers)
275
- ref_str = f"#/page-headers/{index}"
276
- main_text.append(
277
- Ref(
278
- name=element.label,
279
- obj_type=layout_label_to_ds_type.get(element.label),
280
- ref=ref_str,
281
- ),
282
- )
283
- page_headers.append(tel)
284
- elif element.label == DocItemLabel.PAGE_FOOTER:
285
- index = len(page_footers)
286
- ref_str = f"#/page-footers/{index}"
287
- main_text.append(
288
- Ref(
289
- name=element.label,
290
- obj_type=layout_label_to_ds_type.get(element.label),
291
- ref=ref_str,
292
- ),
293
- )
294
- page_footers.append(tel)
295
-
296
- page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in conv_res.pages
299
- if p.size is not None
300
- ]
301
-
302
- ds_doc: DsDocument = DsDocument(
303
- name=title,
304
- description=desc,
305
- file_info=file_info,
306
- main_text=main_text,
307
- tables=tables,
308
- figures=figures,
309
- page_dimensions=page_dimensions,
310
- page_headers=page_headers,
311
- page_footers=page_footers,
312
- )
313
-
314
- return ds_doc
315
-
316
- def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
317
- with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
318
- ds_doc = self._to_legacy_document(conv_res)
319
- ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
320
-
321
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
322
-
323
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
324
- 1 == 1
325
-
326
- # DEBUG code:
327
- def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
328
- clusters_to_draw = []
329
- image = copy.deepcopy(conv_res.pages[page_no].image)
330
- for ix, elem in enumerate(ds_document.main_text):
331
- if isinstance(elem, BaseText):
332
- prov = elem.prov[0] # type: ignore
333
- elif isinstance(elem, Ref):
334
- _, arr, index = elem.ref.split("/")
335
- index = int(index) # type: ignore
336
- if arr == "tables":
337
- prov = ds_document.tables[index].prov[0]
338
- elif arr == "figures":
339
- prov = ds_document.pictures[index].prov[0]
340
- else:
341
- prov = None
342
-
343
- if prov and prov.page == page_no:
344
- clusters_to_draw.append(
345
- Cluster(
346
- id=ix,
347
- label=elem.name,
348
- bbox=BoundingBox.from_tuple(
349
- coord=prov.bbox, # type: ignore
350
- origin=CoordOrigin.BOTTOMLEFT,
351
- ).to_top_left_origin(conv_res.pages[page_no].size.height),
352
- )
353
- )
354
-
355
- draw = ImageDraw.Draw(image)
356
- for c in clusters_to_draw:
357
- x0, y0, x1, y1 = c.bbox.as_tuple()
358
- draw.rectangle([(x0, y0), (x1, y1)], outline="red")
359
- draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
360
-
361
- cell_color = (
362
- random.randint(30, 140),
363
- random.randint(30, 140),
364
- random.randint(30, 140),
365
- )
366
- for tc in c.cells: # [:1]:
367
- x0, y0, x1, y1 = tc.bbox.as_tuple()
368
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
369
-
370
- if show:
371
- image.show()
372
- else:
373
- out_path: Path = (
374
- Path(settings.debug.debug_output_path)
375
- / f"debug_{conv_res.input.file.stem}"
376
- )
377
- out_path.mkdir(parents=True, exist_ok=True)
378
-
379
- out_file = out_path / f"doc_page_{page_no:05}.png"
380
- image.save(str(out_file), format="png")
381
-
382
- # for item in ds_doc.page_dimensions:
383
- # page_no = item.page
384
- # draw_clusters_and_cells(ds_doc, page_no)
385
-
386
- return docling_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py DELETED
@@ -1,177 +0,0 @@
1
- import logging
2
- import warnings
3
- import zipfile
4
- from pathlib import Path
5
- from typing import Iterable, List, Optional
6
-
7
- import numpy
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
-
10
- from docling.datamodel.base_models import Cell, OcrCell, Page
11
- from docling.datamodel.document import ConversionResult
12
- from docling.datamodel.pipeline_options import (
13
- AcceleratorDevice,
14
- AcceleratorOptions,
15
- EasyOcrOptions,
16
- )
17
- from docling.datamodel.settings import settings
18
- from docling.models.base_ocr_model import BaseOcrModel
19
- from docling.utils.accelerator_utils import decide_device
20
- from docling.utils.profiling import TimeRecorder
21
- from docling.utils.utils import download_url_with_progress
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class EasyOcrModel(BaseOcrModel):
27
- _model_repo_folder = "EasyOcr"
28
-
29
- def __init__(
30
- self,
31
- enabled: bool,
32
- artifacts_path: Optional[Path],
33
- options: EasyOcrOptions,
34
- accelerator_options: AcceleratorOptions,
35
- ):
36
- super().__init__(enabled=enabled, options=options)
37
- self.options: EasyOcrOptions
38
-
39
- self.scale = 3 # multiplier for 72 dpi == 216 dpi.
40
-
41
- if self.enabled:
42
- try:
43
- import easyocr
44
- except ImportError:
45
- raise ImportError(
46
- "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
47
- "Alternatively, Docling has support for other OCR engines. See the documentation."
48
- )
49
-
50
- if self.options.use_gpu is None:
51
- device = decide_device(accelerator_options.device)
52
- # Enable easyocr GPU if running on CUDA, MPS
53
- use_gpu = any(
54
- [
55
- device.startswith(x)
56
- for x in [
57
- AcceleratorDevice.CUDA.value,
58
- AcceleratorDevice.MPS.value,
59
- ]
60
- ]
61
- )
62
- else:
63
- warnings.warn(
64
- "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
65
- "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
66
- "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
67
- )
68
- use_gpu = self.options.use_gpu
69
-
70
- download_enabled = self.options.download_enabled
71
- model_storage_directory = self.options.model_storage_directory
72
- if artifacts_path is not None and model_storage_directory is None:
73
- download_enabled = False
74
- model_storage_directory = str(artifacts_path / self._model_repo_folder)
75
-
76
- self.reader = easyocr.Reader(
77
- lang_list=self.options.lang,
78
- gpu=use_gpu,
79
- model_storage_directory=model_storage_directory,
80
- recog_network=self.options.recog_network,
81
- download_enabled=download_enabled,
82
- verbose=False,
83
- )
84
-
85
- @staticmethod
86
- def download_models(
87
- detection_models: List[str] = ["craft"],
88
- recognition_models: List[str] = ["english_g2", "latin_g2"],
89
- local_dir: Optional[Path] = None,
90
- force: bool = False,
91
- progress: bool = False,
92
- ) -> Path:
93
- # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
94
- from easyocr.config import detection_models as det_models_dict
95
- from easyocr.config import recognition_models as rec_models_dict
96
-
97
- if local_dir is None:
98
- local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
99
-
100
- local_dir.mkdir(parents=True, exist_ok=True)
101
-
102
- # Collect models to download
103
- download_list = []
104
- for model_name in detection_models:
105
- if model_name in det_models_dict:
106
- download_list.append(det_models_dict[model_name])
107
- for model_name in recognition_models:
108
- if model_name in rec_models_dict["gen2"]:
109
- download_list.append(rec_models_dict["gen2"][model_name])
110
-
111
- # Download models
112
- for model_details in download_list:
113
- buf = download_url_with_progress(model_details["url"], progress=progress)
114
- with zipfile.ZipFile(buf, "r") as zip_ref:
115
- zip_ref.extractall(local_dir)
116
-
117
- return local_dir
118
-
119
- def __call__(
120
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
121
- ) -> Iterable[Page]:
122
-
123
- if not self.enabled:
124
- yield from page_batch
125
- return
126
-
127
- for page in page_batch:
128
-
129
- assert page._backend is not None
130
- if not page._backend.is_valid():
131
- yield page
132
- else:
133
- with TimeRecorder(conv_res, "ocr"):
134
- ocr_rects = self.get_ocr_rects(page)
135
-
136
- all_ocr_cells = []
137
- for ocr_rect in ocr_rects:
138
- # Skip zero area boxes
139
- if ocr_rect.area() == 0:
140
- continue
141
- high_res_image = page._backend.get_page_image(
142
- scale=self.scale, cropbox=ocr_rect
143
- )
144
- im = numpy.array(high_res_image)
145
- result = self.reader.readtext(im)
146
-
147
- del high_res_image
148
- del im
149
-
150
- cells = [
151
- OcrCell(
152
- id=ix,
153
- text=line[1],
154
- confidence=line[2],
155
- bbox=BoundingBox.from_tuple(
156
- coord=(
157
- (line[0][0][0] / self.scale) + ocr_rect.l,
158
- (line[0][0][1] / self.scale) + ocr_rect.t,
159
- (line[0][2][0] / self.scale) + ocr_rect.l,
160
- (line[0][2][1] / self.scale) + ocr_rect.t,
161
- ),
162
- origin=CoordOrigin.TOPLEFT,
163
- ),
164
- )
165
- for ix, line in enumerate(result)
166
- if line[2] >= self.options.confidence_threshold
167
- ]
168
- all_ocr_cells.extend(cells)
169
-
170
- # Post-process the cells
171
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
172
-
173
- # DEBUG code:
174
- if settings.debug.visualize_ocr:
175
- self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
176
-
177
- yield page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py DELETED
@@ -1,197 +0,0 @@
1
- import copy
2
- import logging
3
- import warnings
4
- from pathlib import Path
5
- from typing import Iterable, Optional, Union
6
-
7
- from docling_core.types.doc import DocItemLabel
8
- from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
9
- from PIL import Image
10
-
11
- from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
12
- from docling.datamodel.document import ConversionResult
13
- from docling.datamodel.pipeline_options import AcceleratorOptions
14
- from docling.datamodel.settings import settings
15
- from docling.models.base_model import BasePageModel
16
- from docling.utils.accelerator_utils import decide_device
17
- from docling.utils.layout_postprocessor import LayoutPostprocessor
18
- from docling.utils.profiling import TimeRecorder
19
- from docling.utils.visualization import draw_clusters
20
-
21
- _log = logging.getLogger(__name__)
22
-
23
-
24
- class LayoutModel(BasePageModel):
25
- _model_repo_folder = "ds4sd--docling-models"
26
- _model_path = "model_artifacts/layout"
27
-
28
- TEXT_ELEM_LABELS = [
29
- DocItemLabel.TEXT,
30
- DocItemLabel.FOOTNOTE,
31
- DocItemLabel.CAPTION,
32
- DocItemLabel.CHECKBOX_UNSELECTED,
33
- DocItemLabel.CHECKBOX_SELECTED,
34
- DocItemLabel.SECTION_HEADER,
35
- DocItemLabel.PAGE_HEADER,
36
- DocItemLabel.PAGE_FOOTER,
37
- DocItemLabel.CODE,
38
- DocItemLabel.LIST_ITEM,
39
- DocItemLabel.FORMULA,
40
- ]
41
- PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
42
-
43
- TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
44
- FIGURE_LABEL = DocItemLabel.PICTURE
45
- FORMULA_LABEL = DocItemLabel.FORMULA
46
- CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
47
-
48
- def __init__(
49
- self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
50
- ):
51
- device = decide_device(accelerator_options.device)
52
-
53
- if artifacts_path is None:
54
- artifacts_path = self.download_models() / self._model_path
55
- else:
56
- # will become the default in the future
57
- if (artifacts_path / self._model_repo_folder).exists():
58
- artifacts_path = (
59
- artifacts_path / self._model_repo_folder / self._model_path
60
- )
61
- elif (artifacts_path / self._model_path).exists():
62
- warnings.warn(
63
- "The usage of artifacts_path containing directly "
64
- f"{self._model_path} is deprecated. Please point "
65
- "the artifacts_path to the parent containing "
66
- f"the {self._model_repo_folder} folder.",
67
- DeprecationWarning,
68
- stacklevel=3,
69
- )
70
- artifacts_path = artifacts_path / self._model_path
71
-
72
- self.layout_predictor = LayoutPredictor(
73
- artifact_path=str(artifacts_path),
74
- device=device,
75
- num_threads=accelerator_options.num_threads,
76
- )
77
-
78
- @staticmethod
79
- def download_models(
80
- local_dir: Optional[Path] = None,
81
- force: bool = False,
82
- progress: bool = False,
83
- ) -> Path:
84
- from huggingface_hub import snapshot_download
85
- from huggingface_hub.utils import disable_progress_bars
86
-
87
- if not progress:
88
- disable_progress_bars()
89
- download_path = snapshot_download(
90
- repo_id="ds4sd/docling-models",
91
- force_download=force,
92
- local_dir=local_dir,
93
- revision="v2.1.0",
94
- )
95
-
96
- return Path(download_path)
97
-
98
- def draw_clusters_and_cells_side_by_side(
99
- self, conv_res, page, clusters, mode_prefix: str, show: bool = False
100
- ):
101
- """
102
- Draws a page image side by side with clusters filtered into two categories:
103
- - Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE.
104
- - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
105
- Includes label names and confidence scores for each cluster.
106
- """
107
- scale_x = page.image.width / page.size.width
108
- scale_y = page.image.height / page.size.height
109
-
110
- # Filter clusters for left and right images
111
- exclude_labels = {
112
- DocItemLabel.FORM,
113
- DocItemLabel.KEY_VALUE_REGION,
114
- DocItemLabel.PICTURE,
115
- }
116
- left_clusters = [c for c in clusters if c.label not in exclude_labels]
117
- right_clusters = [c for c in clusters if c.label in exclude_labels]
118
- # Create a deep copy of the original image for both sides
119
- left_image = copy.deepcopy(page.image)
120
- right_image = copy.deepcopy(page.image)
121
-
122
- # Draw clusters on both images
123
- draw_clusters(left_image, left_clusters, scale_x, scale_y)
124
- draw_clusters(right_image, right_clusters, scale_x, scale_y)
125
- # Combine the images side by side
126
- combined_width = left_image.width * 2
127
- combined_height = left_image.height
128
- combined_image = Image.new("RGB", (combined_width, combined_height))
129
- combined_image.paste(left_image, (0, 0))
130
- combined_image.paste(right_image, (left_image.width, 0))
131
- if show:
132
- combined_image.show()
133
- else:
134
- out_path: Path = (
135
- Path(settings.debug.debug_output_path)
136
- / f"debug_{conv_res.input.file.stem}"
137
- )
138
- out_path.mkdir(parents=True, exist_ok=True)
139
- out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
140
- combined_image.save(str(out_file), format="png")
141
-
142
- def __call__(
143
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
144
- ) -> Iterable[Page]:
145
-
146
- for page in page_batch:
147
- assert page._backend is not None
148
- if not page._backend.is_valid():
149
- yield page
150
- else:
151
- with TimeRecorder(conv_res, "layout"):
152
- assert page.size is not None
153
- page_image = page.get_image(scale=1.0)
154
- assert page_image is not None
155
-
156
- clusters = []
157
- for ix, pred_item in enumerate(
158
- self.layout_predictor.predict(page_image)
159
- ):
160
- label = DocItemLabel(
161
- pred_item["label"]
162
- .lower()
163
- .replace(" ", "_")
164
- .replace("-", "_")
165
- ) # Temporary, until docling-ibm-model uses docling-core types
166
- cluster = Cluster(
167
- id=ix,
168
- label=label,
169
- confidence=pred_item["confidence"],
170
- bbox=BoundingBox.model_validate(pred_item),
171
- cells=[],
172
- )
173
- clusters.append(cluster)
174
-
175
- if settings.debug.visualize_raw_layout:
176
- self.draw_clusters_and_cells_side_by_side(
177
- conv_res, page, clusters, mode_prefix="raw"
178
- )
179
-
180
- # Apply postprocessing
181
-
182
- processed_clusters, processed_cells = LayoutPostprocessor(
183
- page.cells, clusters, page.size
184
- ).postprocess()
185
- # processed_clusters, processed_cells = clusters, page.cells
186
-
187
- page.cells = processed_cells
188
- page.predictions.layout = LayoutPrediction(
189
- clusters=processed_clusters
190
- )
191
-
192
- if settings.debug.visualize_layout:
193
- self.draw_clusters_and_cells_side_by_side(
194
- conv_res, page, processed_clusters, mode_prefix="postprocessed"
195
- )
196
-
197
- yield page