Spaces:
Running
Running
ZaynZhu
commited on
Commit
·
b447602
1
Parent(s):
44efbff
fix
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Paper2Video/LICENSE +0 -21
- Paper2Video/README-CN.md +0 -248
- Paper2Video/README.md +0 -251
- Paper2Video/__init__.py +0 -0
- Paper2Video/src/__init__.py +0 -0
- Paper2Video/src/evaluation/IPMemory/construct.py +0 -69
- Paper2Video/src/evaluation/IPMemory/ip_qa.py +0 -142
- Paper2Video/src/evaluation/MetaSim_audio.py +0 -102
- Paper2Video/src/evaluation/MetaSim_content.py +0 -144
- Paper2Video/src/evaluation/PresentArena.py +0 -106
- Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py +0 -264
- Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py +0 -47
- Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py +0 -63
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py +0 -430
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py +0 -227
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py +0 -250
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py +0 -442
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py +0 -58
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py +0 -428
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py +0 -386
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py +0 -424
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py +0 -582
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py +0 -76
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py +0 -260
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py +0 -592
- Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py +0 -1888
- Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py +0 -12
- Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py +0 -456
- Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py +0 -107
- Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py +0 -17
- Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py +0 -258
- Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py +0 -394
- Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py +0 -296
- Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py +0 -67
- Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py +0 -348
- Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py +0 -6
- Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py +0 -0
- Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py +0 -87
- Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py +0 -189
- Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py +0 -251
- Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py +0 -190
- Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py +0 -386
- Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py +0 -177
- Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py +0 -197
Paper2Video/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
MIT License
|
| 2 |
-
|
| 3 |
-
Copyright (c) 2025 Show Lab
|
| 4 |
-
|
| 5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
-
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
-
in the Software without restriction, including without limitation the rights
|
| 8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
-
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
-
furnished to do so, subject to the following conditions:
|
| 11 |
-
|
| 12 |
-
The above copyright notice and this permission notice shall be included in all
|
| 13 |
-
copies or substantial portions of the Software.
|
| 14 |
-
|
| 15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/README-CN.md
DELETED
|
@@ -1,248 +0,0 @@
|
|
| 1 |
-
# Paper2Video
|
| 2 |
-
|
| 3 |
-
<p align="right">
|
| 4 |
-
<a href="./README.md">English</a> | <b>简体中文</b>
|
| 5 |
-
</p>
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
<p align="center">
|
| 9 |
-
<b>Paper2Video: 从学术论文自动生成演讲视频</b>
|
| 10 |
-
<br>
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
<p align="center">
|
| 14 |
-
<a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
|
| 15 |
-
<a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
|
| 16 |
-
<a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
|
| 17 |
-
新加坡国立大学 Show Lab
|
| 18 |
-
</p>
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
<p align="center">
|
| 22 |
-
<a href="https://arxiv.org/abs/2510.05096">📄 论文</a> |
|
| 23 |
-
<a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> |
|
| 24 |
-
<a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 数据集</a> |
|
| 25 |
-
<a href="https://showlab.github.io/Paper2Video/">🌐 项目主页</a> |
|
| 26 |
-
<a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 推特</a>
|
| 27 |
-
</p>
|
| 28 |
-
|
| 29 |
-
- **输入:** 一篇论文 ➕ 一张图像 ➕ 一段音频
|
| 30 |
-
|
| 31 |
-
| 论文 | 图像 | 音频 |
|
| 32 |
-
|--------|--------|--------|
|
| 33 |
-
| <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 论文链接](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton的图像| <img src="assets/sound.png" width="180"/><br>[🔗 音频样本](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
- **输出:** 演讲视频
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
查看更多生成结果 [🌐 project page](https://showlab.github.io/Paper2Video/).
|
| 46 |
-
|
| 47 |
-
## 🔥 Update
|
| 48 |
-
- [x] [2025.10.11] 我们的工作在[YC Hacker News](https://news.ycombinator.com/item?id=45553701)上受到关注.
|
| 49 |
-
- [x] [2025.10.9] 感谢AK在[Twitter](https://x.com/_akhaliq/status/1976099830004072849)上分享我们的工作!
|
| 50 |
-
- [x] [2025.10.9] 我们的工作被 [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27)报道.
|
| 51 |
-
- [x] [2025.10.8] 下方查看我们的demo视频!
|
| 52 |
-
- [x] [2025.10.7] 我们发布了 [Arxiv 论文](https://arxiv.org/abs/2510.05096).
|
| 53 |
-
- [x] [2025.10.6] 我们发布了 [代码](https://github.com/showlab/Paper2Video) and [数据集](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
|
| 54 |
-
- [x] [2025.9.28] Paper2Video 已经被 **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025** 接受.
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
---
|
| 63 |
-
|
| 64 |
-
### Table of Contents
|
| 65 |
-
- [🌟 项目总览](#-项目总览)
|
| 66 |
-
- [🚀 快速上手: PaperTalker](#-快速上手-PaperTalker)
|
| 67 |
-
- [1. 环境配置](#1-环境配置)
|
| 68 |
-
- [2. 大语言模型配置](#2-大语言模型配置)
|
| 69 |
-
- [3. 推理](#3-推理)
|
| 70 |
-
- [📊 评价指标: Paper2Video](#-评价指标-Paper2Video)
|
| 71 |
-
- [😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频](#-乐趣-Paper2Video生成Paper2Video演讲视频)
|
| 72 |
-
- [🙏 致谢](#-致谢)
|
| 73 |
-
- [📌 引用](#-引用)
|
| 74 |
-
---
|
| 75 |
-
|
| 76 |
-
## 🌟 项目总览
|
| 77 |
-
<p align="center">
|
| 78 |
-
<img src="assets/teaser.png" alt="Overview" width="100%">
|
| 79 |
-
</p>
|
| 80 |
-
|
| 81 |
-
这项工作解决了学术演讲的两个核心问题:
|
| 82 |
-
|
| 83 |
-
- **左边: 如何根据论文制作学术演讲?**
|
| 84 |
-
*PaperTalker* — 集成**幻灯片**、**字幕**、**光标**、**语音合成**和**演讲者视频渲染**的多智能体。
|
| 85 |
-
|
| 86 |
-
- **右边: 如何评估学术演讲视频?**
|
| 87 |
-
*Paper2Video* — 一个具有精心设计的指标来评估演示质量的基准。
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
---
|
| 91 |
-
|
| 92 |
-
## 🚀 尝试 PaperTalker 为你的论文制作演讲视频 !
|
| 93 |
-
<p align="center">
|
| 94 |
-
<img src="assets/method.png" alt="Approach" width="100%">
|
| 95 |
-
</p>
|
| 96 |
-
|
| 97 |
-
### 1. 环境配置
|
| 98 |
-
准备Python环境:
|
| 99 |
-
```bash
|
| 100 |
-
cd src
|
| 101 |
-
conda create -n p2v python=3.10
|
| 102 |
-
conda activate p2v
|
| 103 |
-
pip install -r requirements.txt
|
| 104 |
-
conda install -c conda-forge tectonic
|
| 105 |
-
````
|
| 106 |
-
下载所依赖代码,并按照[Hallo2](https://github.com/fudan-generative-vision/hallo2)中的说明下载模型权重。
|
| 107 |
-
```bash
|
| 108 |
-
git clone https://github.com/fudan-generative-vision/hallo2.git
|
| 109 |
-
```
|
| 110 |
-
您需要**单独准备用于 talking-head generation 的环境**，以避免潜在的软件包冲突，请参考<a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>。安装完成后，使用 `which python` 命令获取 Python 环境路径。
|
| 111 |
-
```bash
|
| 112 |
-
cd hallo2
|
| 113 |
-
conda create -n hallo python=3.10
|
| 114 |
-
conda activate hallo
|
| 115 |
-
pip install -r requirements.txt
|
| 116 |
-
```
|
| 117 |
-
|
| 118 |
-
### 2. 大语言模型配置
|
| 119 |
-
在终端配置您的**API 凭证**:
|
| 120 |
-
```bash
|
| 121 |
-
export GEMINI_API_KEY="your_gemini_key_here"
|
| 122 |
-
export OPENAI_API_KEY="your_openai_key_here"
|
| 123 |
-
```
|
| 124 |
-
最佳实践是针对 LLM 和 VLM 使用 **GPT4.1** 或 **Gemini2.5-Pro**。我们也支持本地部署开源模型(例如 Qwen),详情请参阅 <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>。
|
| 125 |
-
|
| 126 |
-
### 3. 推理
|
| 127 |
-
脚本 `pipeline.py` 提供了一个自动化的学术演示视频生成流程。它以 **LaTeX 论文素材** 和 **参考图像/音频** 作为输入，并经过多个子模块（幻灯片 → 字幕 → 语音 → 光标 → 头部特写）生成完整的演示视频。⚡ 运行此流程的最低推荐 GPU 为 **NVIDIA A6000**，显存 48G。
|
| 128 |
-
|
| 129 |
-
#### 示例用法
|
| 130 |
-
|
| 131 |
-
运行以下命令来启动完整生成:
|
| 132 |
-
|
| 133 |
-
```bash
|
| 134 |
-
python pipeline.py \
|
| 135 |
-
--model_name_t gpt-4.1 \
|
| 136 |
-
--model_name_v gpt-4.1 \
|
| 137 |
-
--model_name_talking hallo2 \
|
| 138 |
-
--result_dir /path/to/output \
|
| 139 |
-
--paper_latex_root /path/to/latex_proj \
|
| 140 |
-
--ref_img /path/to/ref_img.png \
|
| 141 |
-
--ref_audio /path/to/ref_audio.wav \
|
| 142 |
-
--talking_head_env /path/to/hallo2_env \
|
| 143 |
-
--gpu_list [0,1,2,3,4,5,6,7]
|
| 144 |
-
```
|
| 145 |
-
|
| 146 |
-
| 参数名 | 类型 | 默认值 | 说明 |
|
| 147 |
-
|----------|------|---------|-------------|
|
| 148 |
-
| `--model_name_t` | `str` | `gpt-4.1` | 文本大语言模型(LLM) |
|
| 149 |
-
| `--model_name_v` | `str` | `gpt-4.1` | 视觉语言模型(VLM) |
|
| 150 |
-
| `--model_name_talking` | `str` | `hallo2` | Talking Head 模型。目前仅支持 **hallo2** |
|
| 151 |
-
| `--result_dir` | `str` | `/path/to/output` | 输出目录(包括幻灯片、字幕、视频等) |
|
| 152 |
-
| `--paper_latex_root` | `str` | `/path/to/latex_proj` | 论文 LaTeX 项目的根目录 |
|
| 153 |
-
| `--ref_img` | `str` | `/path/to/ref_img.png` | 参考图像(必须为**正方形**人像) |
|
| 154 |
-
| `--ref_audio` | `str` | `/path/to/ref_audio.wav` | 参考音频(建议时长约为 10 秒) |
|
| 155 |
-
| `--ref_text` | `str` | `None` | 可选参考文本(用于字幕风格指导) |
|
| 156 |
-
| `--beamer_templete_prompt` | `str` | `None` | 可选参考文本(用于幻灯片风格指导) |
|
| 157 |
-
| `--gpu_list` | `list[int]` | `""` | GPU 列表,用于并行执行(适用于**光标生成**与 **Talking Head 渲染**) |
|
| 158 |
-
| `--if_tree_search` | `bool` | `True` | 是否启用树搜索(用于幻灯片布局优化) |
|
| 159 |
-
| `--stage` | `str` | `"[0]"` | 需要运行的阶段(例如 `[0]` 表示完整流程,`[1,2,3]` 表示部分阶段) |
|
| 160 |
-
| `--talking_head_env` | `str` | `/path/to/hallo2_env` | Talking Head 生成的 Python 环境路径 |
|
| 161 |
-
---
|
| 162 |
-
|
| 163 |
-
## 📊 评价指标: Paper2Video
|
| 164 |
-
<p align="center">
|
| 165 |
-
<img src="assets/metrics.png" alt="Metrics" width="100%">
|
| 166 |
-
</p>
|
| 167 |
-
|
| 168 |
-
与自然视频生成不同,学术演示视频发挥着高度专业化的作用:它们不仅关乎视觉保真度,更关乎**学术交流**。这使得直接应用视频合成中的传统指标(例如 FVD、IS 或基于 CLIP 的相似度)变得困难。相反,它们的价值在于它们如何有效地**传播研究成果**并**提升学术知名度**。从这个角度来看,我们认为,评判高质量的学术演示视频应该从两个互补的维度进行评判:
|
| 169 |
-
#### 对于观众
|
| 170 |
-
- 视频应**忠实传达论文的核心思想**。
|
| 171 |
-
- 视频应**易于不同受众观看**。
|
| 172 |
-
|
| 173 |
-
#### 对于作者
|
| 174 |
-
- 视频应**突出作者的智力贡献和身份**。
|
| 175 |
-
- 视频应**提升作品的知名度和影响力**。
|
| 176 |
-
|
| 177 |
-
为了实现这些目标,我们引入了专门为学术演示视频设计的评估指标:Meta Similarity, PresentArena, PresentQuiz, IP Memory.
|
| 178 |
-
|
| 179 |
-
### 运行评价
|
| 180 |
-
- 准备环境:
|
| 181 |
-
```bash
|
| 182 |
-
cd src/evaluation
|
| 183 |
-
conda create -n p2v_e python=3.10
|
| 184 |
-
conda activate p2v_e
|
| 185 |
-
pip install -r requirements.txt
|
| 186 |
-
```
|
| 187 |
-
- 对于 Meta Similarity 和 PresentArena:
|
| 188 |
-
```bash
|
| 189 |
-
python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 190 |
-
python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 191 |
-
```
|
| 192 |
-
```bash
|
| 193 |
-
python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 194 |
-
```
|
| 195 |
-
- 对于**PresentQuiz**,首先基于论文生成问题并使用 Gemini 进行评估:
|
| 196 |
-
```bash
|
| 197 |
-
cd PresentQuiz
|
| 198 |
-
python create_paper_questions.py --paper_folder /path/to/data
|
| 199 |
-
python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 200 |
-
```
|
| 201 |
-
|
| 202 |
-
- 对于**IP Memory**,首先从生成的视频中生成问题对,然后使用 Gemini 进行评估:
|
| 203 |
-
```bash
|
| 204 |
-
cd IPMemory
|
| 205 |
-
python construct.py
|
| 206 |
-
python ip_qa.py
|
| 207 |
-
```
|
| 208 |
-
更多详情请查看代码!
|
| 209 |
-
|
| 210 |
-
👉 Paper2Video 数据集可在以下网址获取:
|
| 211 |
-
[HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
|
| 212 |
-
|
| 213 |
-
---
|
| 214 |
-
|
| 215 |
-
## 😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频
|
| 216 |
-
查看 **Paper2Video 生成 Paper2Video 演讲视频**:
|
| 217 |
-
|
| 218 |
-
https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
|
| 219 |
-
|
| 220 |
-
## 🙏 致谢
|
| 221 |
-
|
| 222 |
-
* 数据集中演示视频的来源是 SlideLive 和 YouTube。
|
| 223 |
-
* 感谢所有为制作演示视频付出辛勤努力的作者!
|
| 224 |
-
* 感谢 [CAMEL](https://github.com/camel-ai/camel) 开源了组织良好的多智能体框架代码库。
|
| 225 |
-
* 感谢 [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) 和 [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) 作者开源代码。
|
| 226 |
-
* 感谢 [Wei Jia](https://github.com/weeadd) 在数据收集和baselines实现方面所做的努力。我们也感谢所有参与用户调研的参与者。
|
| 227 |
-
* 感谢所有 **Show Lab @ NUS** 成员的支持!
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
---
|
| 232 |
-
|
| 233 |
-
## 📌 引用
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
如果我们的工作对您有帮助,欢迎引用我们的工作:
|
| 237 |
-
|
| 238 |
-
```bibtex
|
| 239 |
-
@misc{paper2video,
|
| 240 |
-
title={Paper2Video: Automatic Video Generation from Scientific Papers},
|
| 241 |
-
author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
|
| 242 |
-
year={2025},
|
| 243 |
-
eprint={2510.05096},
|
| 244 |
-
archivePrefix={arXiv},
|
| 245 |
-
primaryClass={cs.CV},
|
| 246 |
-
url={https://arxiv.org/abs/2510.05096},
|
| 247 |
-
}
|
| 248 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/README.md
DELETED
|
@@ -1,251 +0,0 @@
|
|
| 1 |
-
# Paper2Video
|
| 2 |
-
|
| 3 |
-
<p align="right">
|
| 4 |
-
<b>English</b> | <a href="./README-CN.md">简体中文</a>
|
| 5 |
-
</p>
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
<p align="center">
|
| 9 |
-
<b>Paper2Video: Automatic Video Generation from Scientific Papers</b>
|
| 10 |
-
<br>
|
| 11 |
-
从学术论文自动生成演讲视频
|
| 12 |
-
</p>
|
| 13 |
-
|
| 14 |
-
<p align="center">
|
| 15 |
-
<a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
|
| 16 |
-
<a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
|
| 17 |
-
<a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
|
| 18 |
-
Show Lab, National University of Singapore
|
| 19 |
-
</p>
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
<p align="center">
|
| 23 |
-
<a href="https://arxiv.org/abs/2510.05096">📄 Paper</a> |
|
| 24 |
-
<a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> |
|
| 25 |
-
<a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 Dataset</a> |
|
| 26 |
-
<a href="https://showlab.github.io/Paper2Video/">🌐 Project Website</a> |
|
| 27 |
-
<a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 X (Twitter)</a>
|
| 28 |
-
</p>
|
| 29 |
-
|
| 30 |
-
- **Input:** a paper ➕ an image ➕ an audio
|
| 31 |
-
|
| 32 |
-
| Paper | Image | Audio |
|
| 33 |
-
|--------|--------|--------|
|
| 34 |
-
| <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 Paper link](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton's photo| <img src="assets/sound.png" width="180"/><br>[🔗 Audio sample](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
- **Output:** a presentation video
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
Check out more examples at [🌐 project page](https://showlab.github.io/Paper2Video/).
|
| 47 |
-
|
| 48 |
-
## 🔥 Update
|
| 49 |
-
- [x] [2025.10.11] Our work receives attention on [YC Hacker News](https://news.ycombinator.com/item?id=45553701).
|
| 50 |
-
- [x] [2025.10.9] Thanks AK for sharing our work on [Twitter](https://x.com/_akhaliq/status/1976099830004072849)!
|
| 51 |
-
- [x] [2025.10.9] Our work is reported by [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27).
|
| 52 |
-
- [x] [2025.10.8] Check out our demo video below!
|
| 53 |
-
- [x] [2025.10.7] We release the [arxiv paper](https://arxiv.org/abs/2510.05096).
|
| 54 |
-
- [x] [2025.10.6] We release the [code](https://github.com/showlab/Paper2Video) and [dataset](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
|
| 55 |
-
- [x] [2025.9.28] Paper2Video has been accepted to the **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025**.
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
---
|
| 64 |
-
|
| 65 |
-
### Table of Contents
|
| 66 |
-
- [🌟 Overview](#-overview)
|
| 67 |
-
- [🚀 Quick Start: PaperTalker](#-try-papertalker-for-your-paper-)
|
| 68 |
-
- [1. Requirements](#1-requirements)
|
| 69 |
-
- [2. Configure LLMs](#2-configure-llms)
|
| 70 |
-
- [3. Inference](#3-inference)
|
| 71 |
-
- [📊 Evaluation: Paper2Video](#-evaluation-paper2video)
|
| 72 |
-
- [😼 Fun: Paper2Video for Paper2Video](#-fun-paper2video-for-paper2video)
|
| 73 |
-
- [🙏 Acknowledgements](#-acknowledgements)
|
| 74 |
-
- [📌 Citation](#-citation)
|
| 75 |
-
|
| 76 |
-
---
|
| 77 |
-
|
| 78 |
-
## 🌟 Overview
|
| 79 |
-
<p align="center">
|
| 80 |
-
<img src="assets/teaser.png" alt="Overview" width="100%">
|
| 81 |
-
</p>
|
| 82 |
-
|
| 83 |
-
This work solves two core problems for academic presentations:
|
| 84 |
-
|
| 85 |
-
- **Left: How to create a presentation video from a paper?**
|
| 86 |
-
*PaperTalker* — an agent that integrates **slides**, **subtitling**, **cursor grounding**, **speech synthesis**, and **talking-head video rendering**.
|
| 87 |
-
|
| 88 |
-
- **Right: How to evaluate a presentation video?**
|
| 89 |
-
*Paper2Video* — a benchmark with well-designed metrics to evaluate presentation quality.
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
---
|
| 93 |
-
|
| 94 |
-
## 🚀 Try PaperTalker for your Paper!
|
| 95 |
-
<p align="center">
|
| 96 |
-
<img src="assets/method.png" alt="Approach" width="100%">
|
| 97 |
-
</p>
|
| 98 |
-
|
| 99 |
-
### 1. Requirements
|
| 100 |
-
Prepare the environment:
|
| 101 |
-
```bash
|
| 102 |
-
cd src
|
| 103 |
-
conda create -n p2v python=3.10
|
| 104 |
-
conda activate p2v
|
| 105 |
-
pip install -r requirements.txt
|
| 106 |
-
conda install -c conda-forge tectonic
|
| 107 |
-
````
|
| 108 |
-
Download the dependent code and follow the instructions in **[Hallo2](https://github.com/fudan-generative-vision/hallo2)** to download the model weight.
|
| 109 |
-
```bash
|
| 110 |
-
git clone https://github.com/fudan-generative-vision/hallo2.git
|
| 111 |
-
```
|
| 112 |
-
You need to **prepare the environment separately for talking-head generation** to avoid potential package conflicts; please refer to <a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>. After installing, use `which python` to get the Python environment path.
|
| 113 |
-
```bash
|
| 114 |
-
cd hallo2
|
| 115 |
-
conda create -n hallo python=3.10
|
| 116 |
-
conda activate hallo
|
| 117 |
-
pip install -r requirements.txt
|
| 118 |
-
```
|
| 119 |
-
|
| 120 |
-
### 2. Configure LLMs
|
| 121 |
-
Export your **API credentials**:
|
| 122 |
-
```bash
|
| 123 |
-
export GEMINI_API_KEY="your_gemini_key_here"
|
| 124 |
-
export OPENAI_API_KEY="your_openai_key_here"
|
| 125 |
-
```
|
| 126 |
-
The best practice is to use **GPT4.1** or **Gemini2.5-Pro** for both LLM and VLMs. We also support locally deployed open-source models (e.g., Qwen); for details, please refer to <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>.
|
| 127 |
-
|
| 128 |
-
### 3. Inference
|
| 129 |
-
The script `pipeline.py` provides an automated pipeline for generating academic presentation videos. It takes **LaTeX paper sources** together with **reference image/audio** as input, and goes through multiple sub-modules (Slides → Subtitles → Speech → Cursor → Talking Head) to produce a complete presentation video. ⚡ The minimum recommended GPU for running this pipeline is **NVIDIA A6000** with 48G.
|
| 130 |
-
|
| 131 |
-
#### Example Usage
|
| 132 |
-
|
| 133 |
-
Run the following command to launch a full generation:
|
| 134 |
-
|
| 135 |
-
```bash
|
| 136 |
-
python pipeline.py \
|
| 137 |
-
--model_name_t gpt-4.1 \
|
| 138 |
-
--model_name_v gpt-4.1 \
|
| 139 |
-
--model_name_talking hallo2 \
|
| 140 |
-
--result_dir /path/to/output \
|
| 141 |
-
--paper_latex_root /path/to/latex_proj \
|
| 142 |
-
--ref_img /path/to/ref_img.png \
|
| 143 |
-
--ref_audio /path/to/ref_audio.wav \
|
| 144 |
-
--talking_head_env /path/to/hallo2_env \
|
| 145 |
-
--gpu_list [0,1,2,3,4,5,6,7]
|
| 146 |
-
```
|
| 147 |
-
|
| 148 |
-
| Argument | Type | Default | Description |
|
| 149 |
-
|----------|------|---------|-------------|
|
| 150 |
-
| `--model_name_t` | `str` | `gpt-4.1` | LLM |
|
| 151 |
-
| `--model_name_v` | `str` | `gpt-4.1` | VLM |
|
| 152 |
-
| `--model_name_talking` | `str` | `hallo2` | Talking Head model. Currently only **hallo2** is supported |
|
| 153 |
-
| `--result_dir` | `str` | `/path/to/output` | Output directory (slides, subtitles, videos, etc.) |
|
| 154 |
-
| `--paper_latex_root` | `str` | `/path/to/latex_proj` | Root directory of the LaTeX paper project |
|
| 155 |
-
| `--ref_img` | `str` | `/path/to/ref_img.png` | Reference image (must be **square** portrait) |
|
| 156 |
-
| `--ref_audio` | `str` | `/path/to/ref_audio.wav` | Reference audio (recommended: ~10s) |
|
| 157 |
-
| `--ref_text` | `str` | `None` | Optional reference text (for style guidance for subtitles) |
|
| 158 |
-
| `--beamer_templete_prompt` | `str` | `None` | Optional reference text (for style guidance for slides) |
|
| 159 |
-
| `--gpu_list` | `list[int]` | `""` | GPU list for parallel execution (used in **cursor generation** and **Talking Head rendering**) |
|
| 160 |
-
| `--if_tree_search` | `bool` | `True` | Whether to enable tree search for slide layout refinement |
|
| 161 |
-
| `--stage` | `str` | `"[0]"` | Pipeline stages to run (e.g., `[0]` full pipeline, `[1,2,3]` partial stages) |
|
| 162 |
-
| `--talking_head_env` | `str` | `/path/to/hallo2_env` | Python environment path for talking-head generation |
|
| 163 |
-
---
|
| 164 |
-
|
| 165 |
-
## 📊 Evaluation: Paper2Video
|
| 166 |
-
<p align="center">
|
| 167 |
-
<img src="assets/metrics.png" alt="Metrics" width="100%">
|
| 168 |
-
</p>
|
| 169 |
-
|
| 170 |
-
Unlike natural video generation, academic presentation videos serve a highly specialized role: they are not merely about visual fidelity but about **communicating scholarship**. This makes it difficult to directly apply conventional metrics from video synthesis (e.g., FVD, IS, or CLIP-based similarity). Instead, their value lies in how well they **disseminate research** and **amplify scholarly visibility**. From this perspective, we argue that a high-quality academic presentation video should be judged along two complementary dimensions:
|
| 171 |
-
#### For the Audience
|
| 172 |
-
- The video is expected to **faithfully convey the paper’s core ideas**.
|
| 173 |
-
- It should remain **accessible to diverse audiences**.
|
| 174 |
-
|
| 175 |
-
#### For the Author
|
| 176 |
-
- The video should **foreground the authors’ intellectual contribution and identity**.
|
| 177 |
-
- It should **enhance the work’s visibility and impact**.
|
| 178 |
-
|
| 179 |
-
To capture these goals, we introduce evaluation metrics specifically designed for academic presentation videos: Meta Similarity, PresentArena, PresentQuiz, IP Memory.
|
| 180 |
-
|
| 181 |
-
### Run Eval
|
| 182 |
-
- Prepare the environment:
|
| 183 |
-
```bash
|
| 184 |
-
cd src/evaluation
|
| 185 |
-
conda create -n p2v_e python=3.10
|
| 186 |
-
conda activate p2v_e
|
| 187 |
-
pip install -r requirements.txt
|
| 188 |
-
```
|
| 189 |
-
- For MetaSimilarity and PresentArena:
|
| 190 |
-
```bash
|
| 191 |
-
python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 192 |
-
python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 193 |
-
```
|
| 194 |
-
```bash
|
| 195 |
-
python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 196 |
-
```
|
| 197 |
-
- For **PresentQuiz**, first generate questions from paper and eval using Gemini:
|
| 198 |
-
```bash
|
| 199 |
-
cd PresentQuiz
|
| 200 |
-
python create_paper_questions.py --paper_folder /path/to/data
|
| 201 |
-
python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
|
| 202 |
-
```
|
| 203 |
-
|
| 204 |
-
- For **IP Memory**, first generate question pairs from generated videos and eval using Gemini:
|
| 205 |
-
```bash
|
| 206 |
-
cd IPMemory
|
| 207 |
-
python construct.py
|
| 208 |
-
python ip_qa.py
|
| 209 |
-
```
|
| 210 |
-
See the codes for more details!
|
| 211 |
-
|
| 212 |
-
👉 Paper2Video Benchmark is available at:
|
| 213 |
-
[HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
|
| 214 |
-
|
| 215 |
-
---
|
| 216 |
-
|
| 217 |
-
## 😼 Fun: Paper2Video for Paper2Video
|
| 218 |
-
Check out how **Paper2Video presents Paper2Video**:
|
| 219 |
-
|
| 220 |
-
https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
|
| 221 |
-
|
| 222 |
-
## 🙏 Acknowledgements
|
| 223 |
-
|
| 224 |
-
* The sources of the presentation videos are SlidesLive and YouTube.
|
| 225 |
-
* We thank all the authors who spent great effort creating presentation videos!
|
| 226 |
-
* We thank [CAMEL](https://github.com/camel-ai/camel) for open-sourcing a well-organized multi-agent framework codebase.
|
| 227 |
-
* We thank the authors of [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) and [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) for their open-sourced codes.
|
| 228 |
-
* We thank [Wei Jia](https://github.com/weeadd) for his effort in collecting the data and implementing the baselines. We also thank all the participants involved in the human studies.
|
| 229 |
-
* We thank all the **Show Lab @ NUS** members for support!
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
---
|
| 234 |
-
|
| 235 |
-
## 📌 Citation
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
If you find our work useful, please cite:
|
| 239 |
-
|
| 240 |
-
```bibtex
|
| 241 |
-
@misc{paper2video,
|
| 242 |
-
title={Paper2Video: Automatic Video Generation from Scientific Papers},
|
| 243 |
-
author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
|
| 244 |
-
year={2025},
|
| 245 |
-
eprint={2510.05096},
|
| 246 |
-
archivePrefix={arXiv},
|
| 247 |
-
primaryClass={cs.CV},
|
| 248 |
-
url={https://arxiv.org/abs/2510.05096},
|
| 249 |
-
}
|
| 250 |
-
```
|
| 251 |
-
[](https://star-history.com/#showlab/Paper2Video&Date)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/IPMemory/construct.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
construct question about Academic IP
|
| 3 |
-
input query: 4 video clips from 4 different paper presentation + query (image/audio)
|
| 4 |
-
input question: 4 understanding qa from corresponding paper
|
| 5 |
-
output task: choose the right question to ask
|
| 6 |
-
"""
|
| 7 |
-
import os, re
|
| 8 |
-
import json
|
| 9 |
-
import random
|
| 10 |
-
import itertools
|
| 11 |
-
from os import path
|
| 12 |
-
from typing import List
|
| 13 |
-
from pathlib import Path
|
| 14 |
-
from tqdm import tqdm
|
| 15 |
-
|
| 16 |
-
def generate_combinations(total_num, comb_size):
|
| 17 |
-
return list(itertools.combinations(range(total_num), comb_size))
|
| 18 |
-
|
| 19 |
-
def generate_ip_task(vaild_data_name, num_qa_pair):
|
| 20 |
-
combs = list(itertools.combinations(range(len(vaild_data_name)), 4))
|
| 21 |
-
combs = random.sample(combs, num_qa_pair)
|
| 22 |
-
|
| 23 |
-
qa_list = []
|
| 24 |
-
for comb in combs:
|
| 25 |
-
## questions
|
| 26 |
-
question_list = []
|
| 27 |
-
question_index = random.randint(1, 50)
|
| 28 |
-
for index in comb:
|
| 29 |
-
question_path = path.join(vaild_data_name[index][1], "4o-mini_qa.json")
|
| 30 |
-
with open(question_path, 'r') as f: question = json.load(f)["understanding"]["questions"]
|
| 31 |
-
question_list.append(question["Question {}".format(str(question_index))]["question"])
|
| 32 |
-
## query
|
| 33 |
-
query_list = []
|
| 34 |
-
for index in comb:
|
| 35 |
-
ref_img_path = path.join(vaild_data_name[index][1], "ref_img.png")
|
| 36 |
-
ref_audio_path = path.join(vaild_data_name[index][1], "ref_audio.wav")
|
| 37 |
-
query_list.append((ref_img_path, ref_audio_path))
|
| 38 |
-
## qa
|
| 39 |
-
qa = {}
|
| 40 |
-
qa["videos"] = []
|
| 41 |
-
for idx in range(len(comb)):
|
| 42 |
-
qa["videos"].append(vaild_data_name[comb[idx]][0])
|
| 43 |
-
|
| 44 |
-
qa["querys"] = query_list
|
| 45 |
-
qa["questions"] = question_list
|
| 46 |
-
qa_list.append(qa)
|
| 47 |
-
with open("ip_qa.json", 'w') as f: json.dump(qa_list, f, indent=4)
|
| 48 |
-
|
| 49 |
-
_num_at_start = re.compile(r'^\s*["\']?(\d+)')
|
| 50 |
-
def sort_by_leading_number(paths: List[str]) -> List[str]:
|
| 51 |
-
def key(p: str):
|
| 52 |
-
name = Path(p).name
|
| 53 |
-
m = _num_at_start.match(name)
|
| 54 |
-
return (int(m.group(1)) if m else float('inf'), name)
|
| 55 |
-
return sorted(paths, key=key)
|
| 56 |
-
|
| 57 |
-
if __name__ == "__main__":
|
| 58 |
-
num_qa_pair = 10 # C (num_data) (4)
|
| 59 |
-
root_dir = "/path/to/result"
|
| 60 |
-
gt_dir = "/path/to/data"
|
| 61 |
-
|
| 62 |
-
all_data_name = sort_by_leading_number(os.listdir(root_dir))
|
| 63 |
-
all_groundtruth = sort_by_leading_number(os.listdir(gt_dir))
|
| 64 |
-
vaild_data_name = []
|
| 65 |
-
for data_idx in range(len(all_data_name)):
|
| 66 |
-
if path.basename(root_dir) == "paper2video":
|
| 67 |
-
video_result_1 = path.join(root_dir, all_data_name[data_idx], "3_merage.mp4")
|
| 68 |
-
video_result_2 = path.join(root_dir.replace("paper2video", "presentagent"), all_data_name[data_idx], "result.mp4")
|
| 69 |
-
generate_ip_task(vaild_data_name, num_qa_pair)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/IPMemory/ip_qa.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import json
|
| 4 |
-
import time
|
| 5 |
-
import random
|
| 6 |
-
import argparse, pdb
|
| 7 |
-
from os import path
|
| 8 |
-
import google.generativeai as genai
|
| 9 |
-
from moviepy.editor import VideoFileClip
|
| 10 |
-
from camel.models import ModelFactory
|
| 11 |
-
from camel.types import ModelType, ModelPlatformType
|
| 12 |
-
from camel.configs import GeminiConfig
|
| 13 |
-
from typing import List
|
| 14 |
-
from pathlib import Path
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
genai.configure(api_key="")
|
| 18 |
-
|
| 19 |
-
_num_at_start = re.compile(r'^\s*["\']?(\d+)')
|
| 20 |
-
def sort_by_leading_number(paths: List[str]) -> List[str]:
|
| 21 |
-
def key(p: str):
|
| 22 |
-
name = Path(p).name
|
| 23 |
-
m = _num_at_start.match(name)
|
| 24 |
-
return (int(m.group(1)) if m else float('inf'), name)
|
| 25 |
-
return sorted(paths, key=key)
|
| 26 |
-
dataset_path = "/path/to/data"
|
| 27 |
-
dataset_list = sort_by_leading_number(os.listdir(dataset_path))
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def eval_ip(root_path, clip_duration, model_list, prompt_path, question_path, test_type='image'):
|
| 31 |
-
tmp_dir = "tmp"
|
| 32 |
-
os.makedirs(tmp_dir, exist_ok=True)
|
| 33 |
-
gemini_model = genai.GenerativeModel("models/gemini-2.5-pro-flash")
|
| 34 |
-
|
| 35 |
-
with open(prompt_path, 'r') as f: prompt = f.readlines()
|
| 36 |
-
prompt = "/n".join(prompt)
|
| 37 |
-
with open(question_path, 'r') as f: questions = json.load(f)
|
| 38 |
-
|
| 39 |
-
result_each_question = []
|
| 40 |
-
for question in questions:
|
| 41 |
-
video_ids = question["videos"]
|
| 42 |
-
querys = question["querys"]
|
| 43 |
-
qs = question["questions"]
|
| 44 |
-
|
| 45 |
-
## get video clips
|
| 46 |
-
video_clips_path = {}
|
| 47 |
-
for model in model_list: video_clips_path[model] = []
|
| 48 |
-
|
| 49 |
-
start_p2v = None
|
| 50 |
-
for vid_id in video_ids:
|
| 51 |
-
tmp_dir_id = path.join(tmp_dir, str(vid_id))
|
| 52 |
-
os.makedirs(tmp_dir_id, exist_ok=True)
|
| 53 |
-
for model in model_list:
|
| 54 |
-
if model == 'p2v': video_path = path.join(root_path, "paper2video", str(vid_id), '3_merage.mp4')
|
| 55 |
-
elif model == 'p2v-o': video_path = path.join(root_path, "paper2video_wo_presenter", str(vid_id), 'result.mp4')
|
| 56 |
-
elif model == 'veo3': video_path = path.join(root_path, "veo3", str(vid_id)+".mp4")
|
| 57 |
-
elif model == 'wan2.2': video_path = path.join(root_path, "wan2.2", str(int(vid_id)-1), "result.mp4")
|
| 58 |
-
elif model == 'presentagent': video_path = path.join(root_path, "presentagent", str(vid_id), "result.mp4")
|
| 59 |
-
elif model == 'human-made': video_path = path.join(dataset_path, dataset_list[int(vid_id)-1], "gt_presentation_video.mp4")
|
| 60 |
-
|
| 61 |
-
video = VideoFileClip(video_path)
|
| 62 |
-
start = random.uniform(0, video.duration-clip_duration-1)
|
| 63 |
-
end = start + clip_duration
|
| 64 |
-
if model == 'p2v' or model == "p2v-o":
|
| 65 |
-
if start_p2v is None:
|
| 66 |
-
start_p2v = random.uniform(0, video.duration-clip_duration-1)
|
| 67 |
-
start = start_p2v
|
| 68 |
-
end = start_p2v + clip_duration
|
| 69 |
-
else:
|
| 70 |
-
start = start_p2v
|
| 71 |
-
end = start_p2v + clip_duration
|
| 72 |
-
else:
|
| 73 |
-
start = random.uniform(0, video.duration-clip_duration-1)
|
| 74 |
-
end = start + clip_duration
|
| 75 |
-
|
| 76 |
-
clip_save_path = path.join(tmp_dir_id, model+".mp4")
|
| 77 |
-
subclip = video.subclip(start, end)
|
| 78 |
-
subclip.write_videofile(clip_save_path, codec="libx264", audio_codec="aac")
|
| 79 |
-
video_clips_path[model].append(clip_save_path)
|
| 80 |
-
## test for each model, 4 qas
|
| 81 |
-
result_each_model = {}
|
| 82 |
-
for model in model_list:
|
| 83 |
-
video_input = video_clips_path[model]
|
| 84 |
-
videos = upload_videos(video_input)
|
| 85 |
-
result_each_model[model] = []
|
| 86 |
-
for idx, query in enumerate(querys):
|
| 87 |
-
if test_type == 'image':
|
| 88 |
-
query = query[0]
|
| 89 |
-
query_state = genai.upload_file(path=query, mime_type="image/png")
|
| 90 |
-
elif test_type == 'aduio':
|
| 91 |
-
query = query[1]
|
| 92 |
-
|
| 93 |
-
answer = idx
|
| 94 |
-
ori_idxs = [0, 1, 2, 3]
|
| 95 |
-
shuffled_idx = ori_idxs.copy()
|
| 96 |
-
random.shuffle(shuffled_idx)
|
| 97 |
-
mapping = {orig: shuffled for orig, shuffled in zip(ori_idxs, shuffled_idx)}
|
| 98 |
-
new_answer = mapping[idx]
|
| 99 |
-
new_qs = [qs[mapping[idx]] for idx in ori_idxs]
|
| 100 |
-
|
| 101 |
-
contents = [prompt, "Here are the quary", genai.get_file(query_state.name), "Here are the video clips"]
|
| 102 |
-
contents.extend(videos)
|
| 103 |
-
contents.extend(["Here are the questions"])
|
| 104 |
-
contents.extend(new_qs)
|
| 105 |
-
|
| 106 |
-
response = gemini_model.generate_content(contents)
|
| 107 |
-
#pdb.set_trace()
|
| 108 |
-
match = re.search(r"My choice:\s*(\d+)", response.text)
|
| 109 |
-
if match: choice_num = int(match.group(1)) - 1
|
| 110 |
-
if choice_num == new_answer:
|
| 111 |
-
result_each_model[model].append([query, new_qs, choice_num, new_answer, True])
|
| 112 |
-
else:
|
| 113 |
-
result_each_model[model].append([query, new_qs, choice_num, new_answer, False])
|
| 114 |
-
result_each_question.append(result_each_model)
|
| 115 |
-
print(result_each_question)
|
| 116 |
-
with open("ip_qa_result.json", 'w') as f: json.dump(result_each_question, f, indent=4)
|
| 117 |
-
|
| 118 |
-
def upload_videos(video_list):
|
| 119 |
-
videos = video_list.copy()
|
| 120 |
-
for idx, value in enumerate(videos):
|
| 121 |
-
videos[idx] = genai.upload_file(path=value, mime_type="video/mp4")
|
| 122 |
-
while True:
|
| 123 |
-
flag = True
|
| 124 |
-
for idx, value in enumerate(videos):
|
| 125 |
-
file_state = genai.get_file(videos[idx].name)
|
| 126 |
-
if file_state.state.name != "ACTIVE":
|
| 127 |
-
flag = False
|
| 128 |
-
time.sleep(5)
|
| 129 |
-
print(f"waiting 5 seconds...")
|
| 130 |
-
break
|
| 131 |
-
if flag: break
|
| 132 |
-
for idx, value in enumerate(videos):
|
| 133 |
-
videos[idx] = genai.get_file(videos[idx].name)
|
| 134 |
-
return videos
|
| 135 |
-
|
| 136 |
-
if __name__ == "__main__":
|
| 137 |
-
clip_duration = 4
|
| 138 |
-
prompt_path = "./prompt/ip_qa.txt"
|
| 139 |
-
model_list = ["p2v", "p2v-o", "veo3", "wan2.2", "presentagent", "human-made"]
|
| 140 |
-
root_path = "/path/to/result"
|
| 141 |
-
question_path = "ip_qa.json"
|
| 142 |
-
eval_ip(root_path, clip_duration, model_list, prompt_path, question_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/MetaSim_audio.py
DELETED
|
@@ -1,102 +0,0 @@
|
|
| 1 |
-
import os, re, json
|
| 2 |
-
import random
|
| 3 |
-
import argparse
|
| 4 |
-
import moviepy.editor as mp
|
| 5 |
-
from os import path
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
from typing import List
|
| 8 |
-
from pyannote.audio import Audio
|
| 9 |
-
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
|
| 10 |
-
from scipy.spatial.distance import cosine
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def extract_random_audio_segment(video_path: str, output_wav_path: str, duration: float = 5.0):
|
| 14 |
-
print(video_path)
|
| 15 |
-
video = mp.VideoFileClip(video_path)
|
| 16 |
-
audio = video.audio
|
| 17 |
-
|
| 18 |
-
total_duration = audio.duration
|
| 19 |
-
if duration >= total_duration: start_time = 0
|
| 20 |
-
else: start_time = random.uniform(0, total_duration - duration)
|
| 21 |
-
|
| 22 |
-
audio_subclip = audio.subclip(start_time, start_time + duration)
|
| 23 |
-
audio_subclip.write_audiofile(output_wav_path, codec='pcm_s16le', fps=16000)
|
| 24 |
-
|
| 25 |
-
def compute_speaker_similarity(audio_path_1: str, audio_path_2: str, device: str = "cuda") -> float:
|
| 26 |
-
embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=device)
|
| 27 |
-
audio_loader = Audio(sample_rate=16000)
|
| 28 |
-
|
| 29 |
-
wav1, _ = audio_loader(audio_path_1)
|
| 30 |
-
wav2, _ = audio_loader(audio_path_2)
|
| 31 |
-
|
| 32 |
-
wav1 = wav1[0:1].unsqueeze(0)
|
| 33 |
-
wav2 = wav2[0:1].unsqueeze(0)
|
| 34 |
-
|
| 35 |
-
embedding1 = embedding_model(wav1)
|
| 36 |
-
embedding2 = embedding_model(wav2)
|
| 37 |
-
embedding1 = embedding1.reshape(embedding1.shape[1])
|
| 38 |
-
embedding2 = embedding2.reshape(embedding2.shape[1])
|
| 39 |
-
|
| 40 |
-
similarity = 1 - cosine(embedding1, embedding2)
|
| 41 |
-
return similarity
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def get_audio_sim_score(gen_video_path, gt_video_path):
|
| 45 |
-
extract_random_audio_segment(gen_video_path, gen_video_path.replace('.mp4', '.wav'), duration=5)
|
| 46 |
-
extract_random_audio_segment(gt_video_path, gt_video_path.replace('.mp4', '.wav'), duration=5)
|
| 47 |
-
similarity = compute_speaker_similarity(gen_video_path.replace('.mp4', '.wav'),
|
| 48 |
-
gt_video_path.replace('.mp4', '.wav'))
|
| 49 |
-
return similarity
|
| 50 |
-
|
| 51 |
-
_num_at_start = re.compile(r'^\s*["\']?(\d+)')
|
| 52 |
-
def sort_by_leading_number(paths: List[str]) -> List[str]:
|
| 53 |
-
def key(p: str):
|
| 54 |
-
name = Path(p).name
|
| 55 |
-
m = _num_at_start.match(name)
|
| 56 |
-
return (int(m.group(1)) if m else float('inf'), name)
|
| 57 |
-
return sorted(paths, key=key)
|
| 58 |
-
|
| 59 |
-
if __name__ == "__main__":
|
| 60 |
-
parser = argparse.ArgumentParser()
|
| 61 |
-
parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
|
| 62 |
-
parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
|
| 63 |
-
parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
|
| 64 |
-
args = parser.parse_args()
|
| 65 |
-
|
| 66 |
-
## load exist result if have
|
| 67 |
-
save_dir = args.save_dir
|
| 68 |
-
save_dir = path.join(save_dir, path.basename(args.result_dir))
|
| 69 |
-
save_path = path.join(save_dir, "audio_sim.json")
|
| 70 |
-
os.makedirs(save_dir, exist_ok=True)
|
| 71 |
-
if path.exists(save_path):
|
| 72 |
-
with open(save_path, 'r') as f: audio_similarity_list = json.load(f)
|
| 73 |
-
else: audio_similarity_list = []
|
| 74 |
-
|
| 75 |
-
## path
|
| 76 |
-
gt_dir, result_dir = args.gt_dir, args.result_dir
|
| 77 |
-
groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
|
| 78 |
-
result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
|
| 79 |
-
|
| 80 |
-
for index in range(len(audio_similarity_list), 40):
|
| 81 |
-
if path.basename(args.result_dir) == "paper2video":
|
| 82 |
-
p2v_video_path = path.join(result_list[index], "3_merage.mp4")
|
| 83 |
-
elif path.basename(args.result_dir) == "veo3":
|
| 84 |
-
p2v_video_path = path.join(result_list[index])
|
| 85 |
-
else:
|
| 86 |
-
p2v_video_path = path.join(result_list[index], "result.mp4")
|
| 87 |
-
if path.exists(p2v_video_path) is False: continue
|
| 88 |
-
gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
|
| 89 |
-
if path.exists(gt_video_path) is False: continue
|
| 90 |
-
print(p2v_video_path, gt_video_path)
|
| 91 |
-
similarity = get_audio_sim_score(p2v_video_path, gt_video_path)
|
| 92 |
-
audio_similarity_list.append({
|
| 93 |
-
"data_idx": index,
|
| 94 |
-
"score": similarity.item()
|
| 95 |
-
})
|
| 96 |
-
print(audio_similarity_list)
|
| 97 |
-
with open(save_path, 'w') as f: json.dump(audio_similarity_list, f, indent=4)
|
| 98 |
-
|
| 99 |
-
# import numpy as np
|
| 100 |
-
# avg = np.average(similarity_all)
|
| 101 |
-
# var = np.var(similarity_all)
|
| 102 |
-
# print(avg, var)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/MetaSim_content.py
DELETED
|
@@ -1,144 +0,0 @@
|
|
| 1 |
-
import os, re, pdb, json
|
| 2 |
-
from PIL import Image
|
| 3 |
-
import pytesseract
|
| 4 |
-
|
| 5 |
-
import whisperx
|
| 6 |
-
import argparse
|
| 7 |
-
import torch
|
| 8 |
-
import numpy as np
|
| 9 |
-
from os import path
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import List
|
| 12 |
-
from camel.models import ModelFactory
|
| 13 |
-
from camel.types import ModelType, ModelPlatformType
|
| 14 |
-
from camel.configs import GeminiConfig
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
os.environ["GEMINI_API_KEY"] = ""
|
| 18 |
-
prompt_path = "./prompt/content_sim_score.txt"
|
| 19 |
-
|
| 20 |
-
agent_config = {
|
| 21 |
-
"model_type": ModelType.GEMINI_2_5_FLASH,
|
| 22 |
-
"model_config": GeminiConfig().as_dict(),
|
| 23 |
-
"model_platform": ModelPlatformType.GEMINI,}
|
| 24 |
-
actor_model = ModelFactory.create(
|
| 25 |
-
model_platform=agent_config['model_platform'],
|
| 26 |
-
model_type=agent_config['model_type'],
|
| 27 |
-
model_config_dict=agent_config['model_config'],)
|
| 28 |
-
|
| 29 |
-
def extract_slide_texts(slide_dir):
|
| 30 |
-
slide_texts = []
|
| 31 |
-
for fname in sorted(os.listdir(slide_dir)):
|
| 32 |
-
if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
|
| 33 |
-
path = os.path.join(slide_dir, fname)
|
| 34 |
-
text = pytesseract.image_to_string(Image.open(path))
|
| 35 |
-
slide_texts.append(text.strip())
|
| 36 |
-
return slide_texts
|
| 37 |
-
|
| 38 |
-
def load_subtitles(sub_path):
|
| 39 |
-
with open(sub_path, "r") as f:
|
| 40 |
-
lines = f.readlines()
|
| 41 |
-
return [line.strip() for line in lines if line.strip()]
|
| 42 |
-
|
| 43 |
-
def build_prompt(slides_1, subs_1, slides_2, subs_2):
|
| 44 |
-
prompt = (
|
| 45 |
-
"Human Presentation:\n"
|
| 46 |
-
"Slides:\n" + "\n".join(slides_1) + "\n"
|
| 47 |
-
"Subtitles:\n" + "\n".join(subs_1) + "\n\n"
|
| 48 |
-
"Generated Presentation:\n"
|
| 49 |
-
"Slides:\n" + "\n".join(slides_2) + "\n"
|
| 50 |
-
"Subtitles:\n" + "\n".join(subs_2) + "\n\n")
|
| 51 |
-
return prompt
|
| 52 |
-
|
| 53 |
-
def run_similarity_eval(slide_dir_1, slide_dir_2, sub_path_1, sub_path_2):
|
| 54 |
-
slides_1 = extract_slide_texts(slide_dir_1)
|
| 55 |
-
slides_2 = extract_slide_texts(slide_dir_2)
|
| 56 |
-
subs_1 = load_subtitles(sub_path_1)
|
| 57 |
-
subs_2 = load_subtitles(sub_path_2)
|
| 58 |
-
|
| 59 |
-
with open(prompt_path, 'r') as f: prompt = f.readlines()
|
| 60 |
-
prompt = "\n".join(prompt)
|
| 61 |
-
prompt_q = build_prompt(slides_1, subs_1, slides_2, subs_2)
|
| 62 |
-
prompt = prompt + '/n' + prompt_q
|
| 63 |
-
|
| 64 |
-
output = actor_model.run([{"role": "user", "content": prompt}])
|
| 65 |
-
print("=== Similarity Evaluation ===\n")
|
| 66 |
-
print(output.choices[0].message.content)
|
| 67 |
-
return output.choices[0].message.content
|
| 68 |
-
|
| 69 |
-
def extract_plain_subtitle_with_whisperx(video_path: str, output_path: str, model_name: str = "large-v3", language: str = "en"):
|
| 70 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 71 |
-
model = whisperx.load_model(model_name, device=device, language=language)
|
| 72 |
-
|
| 73 |
-
audio = whisperx.load_audio(video_path)
|
| 74 |
-
result = model.transcribe(audio, batch_size=16)
|
| 75 |
-
|
| 76 |
-
with open(output_path, "w") as f:
|
| 77 |
-
for seg in result["segments"]:
|
| 78 |
-
f.write(seg["text"].strip() + "\n")
|
| 79 |
-
|
| 80 |
-
def extract_similarity_scores(text):
|
| 81 |
-
content_match = re.search(r"Content Similarity:\s*(\d+)/5", text)
|
| 82 |
-
if content_match:
|
| 83 |
-
content_score = int(content_match.group(1))
|
| 84 |
-
return content_score
|
| 85 |
-
|
| 86 |
-
_num_at_start = re.compile(r'^\s*["\']?(\d+)')
|
| 87 |
-
def sort_by_leading_number(paths: List[str]) -> List[str]:
|
| 88 |
-
def key(p: str):
|
| 89 |
-
name = Path(p).name
|
| 90 |
-
m = _num_at_start.match(name)
|
| 91 |
-
return (int(m.group(1)) if m else float('inf'), name)
|
| 92 |
-
return sorted(paths, key=key)
|
| 93 |
-
|
| 94 |
-
if __name__ == "__main__":
|
| 95 |
-
parser = argparse.ArgumentParser()
|
| 96 |
-
parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
|
| 97 |
-
parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
|
| 98 |
-
parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
|
| 99 |
-
args = parser.parse_args()
|
| 100 |
-
|
| 101 |
-
## load exist result if have
|
| 102 |
-
save_dir = args.save_dir
|
| 103 |
-
save_dir = path.join(save_dir, path.basename(args.result_dir))
|
| 104 |
-
save_path = path.join(save_dir, "content_sim.json")
|
| 105 |
-
os.makedirs(save_dir, exist_ok=True)
|
| 106 |
-
if path.exists(save_path):
|
| 107 |
-
with open(save_path, 'r') as f: content_sim_list = json.load(f)
|
| 108 |
-
else: content_sim_list = []
|
| 109 |
-
|
| 110 |
-
## path
|
| 111 |
-
gt_dir, result_dir = args.gt_dir, args.result_dir
|
| 112 |
-
groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
|
| 113 |
-
result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
|
| 114 |
-
|
| 115 |
-
## eval
|
| 116 |
-
for index in range(25, 100):
|
| 117 |
-
# video -> subtitle
|
| 118 |
-
if path.basename(args.result_dir) == "paper2video":
|
| 119 |
-
p2v_video_path = path.join(result_list[index], "3_merage.mp4")
|
| 120 |
-
if path.exists(p2v_video_path) is False: continue
|
| 121 |
-
else:
|
| 122 |
-
p2v_video_path = path.join(result_list[index], "result.mp4")
|
| 123 |
-
if path.exists(p2v_video_path) is False: continue
|
| 124 |
-
gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
|
| 125 |
-
extract_plain_subtitle_with_whisperx(gt_video_path, gt_video_path.replace(".mp4", "_sub.txt"))
|
| 126 |
-
extract_plain_subtitle_with_whisperx(p2v_video_path, p2v_video_path.replace(".mp4", "_sub.txt"))
|
| 127 |
-
|
| 128 |
-
# slide dir
|
| 129 |
-
gt_slide_dir = path.join(groundtruth_list[index], "slide_imgs")
|
| 130 |
-
p2v_slide_dir = path.join(result_list[index], "slide_imgs")
|
| 131 |
-
|
| 132 |
-
# eval
|
| 133 |
-
result = run_similarity_eval(
|
| 134 |
-
slide_dir_1=gt_slide_dir,
|
| 135 |
-
slide_dir_2=p2v_slide_dir,
|
| 136 |
-
sub_path_1=gt_video_path.replace(".mp4", "_sub.txt"),
|
| 137 |
-
sub_path_2=p2v_video_path.replace(".mp4", "_sub.txt"))
|
| 138 |
-
content_score = extract_similarity_scores(result)
|
| 139 |
-
content_sim_list.append({
|
| 140 |
-
"data_idx": index,
|
| 141 |
-
"score": content_score
|
| 142 |
-
})
|
| 143 |
-
|
| 144 |
-
with open(save_path, 'w') as f: json.dump(content_sim_list, f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentArena.py
DELETED
|
@@ -1,106 +0,0 @@
|
|
| 1 |
-
'''
|
| 2 |
-
Using VideoLLM (Gemini) as judger
|
| 3 |
-
'''
|
| 4 |
-
import os, re, json
|
| 5 |
-
import time
|
| 6 |
-
import argparse
|
| 7 |
-
import google.generativeai as genai
|
| 8 |
-
from os import path
|
| 9 |
-
from typing import List
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from tqdm import tqdm
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
genai.configure(api_key="")
|
| 15 |
-
def eval_gemini(gt_vid_path, gen_vid_path):
|
| 16 |
-
model = genai.GenerativeModel("models/gemini-2.5-pro")
|
| 17 |
-
gt_vid = genai.upload_file(path=gt_vid_path, mime_type="video/mp4")
|
| 18 |
-
gen_vid = genai.upload_file(path=gen_vid_path, mime_type="video/mp4")
|
| 19 |
-
while True:
|
| 20 |
-
refreshed_1 = genai.get_file(gt_vid.name)
|
| 21 |
-
refreshed_2 = genai.get_file(gen_vid.name)
|
| 22 |
-
if refreshed_1.state.name == "ACTIVE" and refreshed_2.state.name == "ACTIVE": break
|
| 23 |
-
elif refreshed_1.state.name == "FAILED" or refreshed_2.state.name == "FAILED":
|
| 24 |
-
#raise RuntimeError("❌ File processing failed.")
|
| 25 |
-
return None
|
| 26 |
-
else:
|
| 27 |
-
print(f"waiting 5 seconds...")
|
| 28 |
-
time.sleep(5)
|
| 29 |
-
|
| 30 |
-
prompt_path = "./prompt/which_is_better.txt"
|
| 31 |
-
with open(prompt_path, 'r') as f: prompt = f.readlines()
|
| 32 |
-
prompt = "/n".join(prompt)
|
| 33 |
-
print("Sending prompt to Gemini...")
|
| 34 |
-
response = model.generate_content([prompt, refreshed_1, refreshed_2])
|
| 35 |
-
print("\n===== Evaluation Result =====")
|
| 36 |
-
print(response.text)
|
| 37 |
-
print("=============================\n")
|
| 38 |
-
|
| 39 |
-
return response.text
|
| 40 |
-
|
| 41 |
-
_num_at_start = re.compile(r'^\s*["\']?(\d+)')
|
| 42 |
-
def sort_by_leading_number(paths: List[str]) -> List[str]:
|
| 43 |
-
def key(p: str):
|
| 44 |
-
name = Path(p).name
|
| 45 |
-
m = _num_at_start.match(name)
|
| 46 |
-
return (int(m.group(1)) if m else float('inf'), name)
|
| 47 |
-
return sorted(paths, key=key)
|
| 48 |
-
|
| 49 |
-
if __name__ == "__main__":
|
| 50 |
-
parser = argparse.ArgumentParser()
|
| 51 |
-
parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
|
| 52 |
-
parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
|
| 53 |
-
parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
|
| 54 |
-
args = parser.parse_args()
|
| 55 |
-
|
| 56 |
-
## load exist result if have
|
| 57 |
-
save_dir = args.save_dir
|
| 58 |
-
if path.basename(args.result_dir) == "paper2video":
|
| 59 |
-
save_dir = path.join(save_dir, path.basename(args.result_dir))
|
| 60 |
-
else: save_dir = path.join(save_dir, path.basename(args.result_dir))
|
| 61 |
-
|
| 62 |
-
save_path = path.join(save_dir, "video_arena.json")
|
| 63 |
-
os.makedirs(save_dir, exist_ok=True)
|
| 64 |
-
if path.exists(save_path):
|
| 65 |
-
with open(save_path, 'r') as f: arena_score_list = json.load(f)
|
| 66 |
-
else: arena_score_list = []
|
| 67 |
-
|
| 68 |
-
## path
|
| 69 |
-
gt_dir, result_dir = args.gt_dir, args.result_dir
|
| 70 |
-
groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
|
| 71 |
-
result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
|
| 72 |
-
|
| 73 |
-
## Generated v.s GT (1)
|
| 74 |
-
for index in tqdm(len(result_list)):
|
| 75 |
-
if path.basename(args.result_dir) == "paper2video":
|
| 76 |
-
test_video_path = path.join(result_list[index], "3_merage.mp4")
|
| 77 |
-
elif path.basename(args.result_dir) == 'veo3':
|
| 78 |
-
test_video_path = result_list[index]
|
| 79 |
-
else:
|
| 80 |
-
test_video_path = path.join(result_list[index], "result.mp4")
|
| 81 |
-
|
| 82 |
-
if path.exists(test_video_path) is False: continue
|
| 83 |
-
gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
|
| 84 |
-
if path.exists(gt_video_path) is False:
|
| 85 |
-
gt_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
|
| 86 |
-
if path.exists(gt_video_path) is False: continue
|
| 87 |
-
result = eval_gemini(gt_video_path, test_video_path)
|
| 88 |
-
if result is None: continue
|
| 89 |
-
|
| 90 |
-
pat = r"\[(?:A|B)\]"
|
| 91 |
-
m = re.findall(pat, result, flags=re.I)
|
| 92 |
-
score = 0
|
| 93 |
-
if m[0][1] == "B": score += 1
|
| 94 |
-
|
| 95 |
-
result = eval_gemini(test_video_path, gt_video_path)
|
| 96 |
-
if result is None: continue
|
| 97 |
-
|
| 98 |
-
pat = r"\[(?:A|B)\]"
|
| 99 |
-
m = re.findall(pat, result, flags=re.I)
|
| 100 |
-
if m[0][1] == "A": score += 1
|
| 101 |
-
|
| 102 |
-
arena_score_list.append({
|
| 103 |
-
"data_idx": index,
|
| 104 |
-
"score": score/2
|
| 105 |
-
})
|
| 106 |
-
with open(save_path, 'w') as f: json.dump(arena_score_list, f, indent=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py
DELETED
|
@@ -1,264 +0,0 @@
|
|
| 1 |
-
import random
|
| 2 |
-
import string
|
| 3 |
-
import yaml
|
| 4 |
-
import PIL
|
| 5 |
-
import tempfile
|
| 6 |
-
import io
|
| 7 |
-
import argparse
|
| 8 |
-
from os import path
|
| 9 |
-
from camel.models import ModelFactory
|
| 10 |
-
from math import ceil
|
| 11 |
-
from openai import OpenAI
|
| 12 |
-
from camel.messages import BaseMessage
|
| 13 |
-
from utils.src.model_utils import parse_pdf
|
| 14 |
-
from urllib.parse import unquote
|
| 15 |
-
from copy import deepcopy
|
| 16 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 17 |
-
from pytorch_fid.fid_score import compute_statistics_of_path
|
| 18 |
-
import pytorch_fid.fid_score as fid
|
| 19 |
-
from PIL import Image
|
| 20 |
-
from httpx import Timeout
|
| 21 |
-
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 22 |
-
import re
|
| 23 |
-
import shutil
|
| 24 |
-
import pytesseract
|
| 25 |
-
from utils.wei_utils import account_token
|
| 26 |
-
from camel.types import ModelPlatformType, ModelType
|
| 27 |
-
from marker.models import create_model_dict
|
| 28 |
-
from camel.configs import ChatGPTConfig
|
| 29 |
-
from camel.agents import ChatAgent
|
| 30 |
-
from jinja2 import Environment, StrictUndefined
|
| 31 |
-
from utils.src.utils import get_json_from_response
|
| 32 |
-
from pathlib import Path
|
| 33 |
-
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
| 34 |
-
from collections import defaultdict
|
| 35 |
-
from camel.configs import ChatGPTConfig, QwenConfig, VLLMConfig, OpenRouterConfig, GeminiConfig
|
| 36 |
-
|
| 37 |
-
from docling.datamodel.base_models import InputFormat
|
| 38 |
-
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 39 |
-
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 40 |
-
|
| 41 |
-
import math
|
| 42 |
-
import base64
|
| 43 |
-
import requests
|
| 44 |
-
from io import BytesIO
|
| 45 |
-
from PIL import Image
|
| 46 |
-
|
| 47 |
-
import torch
|
| 48 |
-
import json
|
| 49 |
-
import os
|
| 50 |
-
import pickle as pkl
|
| 51 |
-
import numpy as np
|
| 52 |
-
from transformers import AltCLIPProcessor, AltCLIPModel
|
| 53 |
-
from pathlib import Path
|
| 54 |
-
from typing import List
|
| 55 |
-
from moviepy.editor import VideoFileClip
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
os.environ["GEMINI_API_KEY"] = ""
|
| 59 |
-
|
| 60 |
-
def compute_accuracy(predicted, ground_truth, aspects):
    """Score multiple-choice predictions against the ground-truth answer key.

    Parameters
    ----------
    predicted : dict
        {question: {'answer': <letter>, 'reference': ...}, ...}
    ground_truth : dict
        {question: '<letter>. full answer', ...}
    aspects : dict
        {question: '<aspect name>', ...}

    Returns
    -------
    overall_accuracy : float
        Correct answers divided by the number of ground-truth questions
        (0.0 when there are no ground-truth questions).
    aspect_summary : dict
        {'<aspect name>': {'total': int, 'correct': int, 'accuracy': float}, ...}
    """
    num_questions = len(ground_truth)
    num_correct = 0

    aspect_totals = defaultdict(int)
    aspect_correct = defaultdict(int)

    for question, info in predicted.items():
        # Every predicted question counts toward its aspect's total,
        # even when it has no ground-truth entry.
        aspect = aspects.get(question, 'Unknown')
        aspect_totals[aspect] += 1

        if question not in ground_truth:
            continue

        # Ground truth is stored as '<letter>. full answer text'.
        expected_letter = ground_truth[question].split('.')[0].strip()

        guess = info['answer']
        if guess:
            # Compare only the first character, case-insensitively.
            guess = guess[0].upper()
        if guess == expected_letter:
            num_correct += 1
            aspect_correct[aspect] += 1

    overall_accuracy = num_correct / num_questions if num_questions else 0.0

    aspect_summary = {
        aspect: {
            'total': total,
            'correct': aspect_correct[aspect],
            'accuracy': aspect_correct[aspect] / total if total else 0.0,
        }
        for aspect, total in aspect_totals.items()
    }

    return overall_accuracy, aspect_summary
|
| 118 |
-
|
| 119 |
-
def eval_qa_get_answer(video_input, questions, answers, aspects, agent_config, input_type='video'):
    """Ask a multimodal agent the quiz questions about a presentation video
    and score its answers against the ground truth.

    Parameters
    ----------
    video_input : str
        Path to the video file to evaluate.
    questions, answers, aspects : dict
        Question set, answer key and per-question aspect labels.
    agent_config : dict
        Keys 'model_platform', 'model_type', 'model_config' for ModelFactory.
    input_type : str
        Only 'video' is supported; other values previously fell through with
        `prompt`/`response` undefined (NameError), so we now fail fast.

    Returns
    -------
    (accuracy, aspect_accuracy, agent_answers, input_token, output_token)
    """
    if input_type != 'video':
        # fix: the original only built the prompt/response inside the
        # `input_type == 'video'` branch, so any other value crashed later
        # with an opaque NameError.
        raise ValueError(f"Unsupported input_type: {input_type!r}")

    agent_name = f'answer_question_from_{input_type}'
    with open(f"prompt/{agent_name}.yaml", "r") as f:
        config = yaml.safe_load(f)

    actor_model = ModelFactory.create(
        model_platform=agent_config['model_platform'],
        model_type=agent_config['model_type'],
        model_config_dict=agent_config['model_config'],
    )

    actor_sys_msg = config['system_prompt']

    actor_agent = ChatAgent(system_message=actor_sys_msg, model=actor_model, message_window_size=None,)
    actor_agent.reset()

    jinja_env = Environment(undefined=StrictUndefined)
    template = jinja_env.from_string(config["template"])
    with open(video_input, "rb") as f:
        video_bytes = f.read()

    prompt = template.render(**{'questions': questions,})

    # fix: close the clip so the underlying ffmpeg reader is released
    # (the original leaked one reader per evaluated video).
    clip = VideoFileClip(video_input)
    try:
        duration = clip.duration
    finally:
        clip.close()

    msg = BaseMessage.make_user_message(
        role_name="User",
        content=prompt+"The video length is {}, you should NOT reference the timesteps if it exceeds video length".format(str(duration)),
        video_bytes=video_bytes,
        video_detail="low")
    response = actor_agent.step(msg)
    agent_answers = get_json_from_response(response.msgs[0].content)

    input_token, output_token = account_token(response)
    accuracy, aspect_accuracy = compute_accuracy(agent_answers, answers, aspects)
    return accuracy, aspect_accuracy, agent_answers, input_token, output_token
|
| 152 |
-
|
| 153 |
-
def run_qa_metric(question_path, video_path, result_path, test_model):
    """Run the detail + understanding QA metric on one video and write the
    aggregated results to `result_path` as JSON.

    Parameters
    ----------
    question_path : str
        Path to the generated '<model>_qa.json' question file.
    video_path : str
        Presentation video to evaluate.
    result_path : str
        Output JSON file (overwritten).
    test_model : str
        Answering model; only 'gemini' is currently supported.
    """
    if test_model == "gemini":
        agent_config = {
            "model_type": ModelType.GEMINI_2_5_FLASH,
            "model_config": GeminiConfig().as_dict(),
            "model_platform": ModelPlatformType.GEMINI,
        }
    else:
        # fix: any other value previously left `agent_config` undefined and
        # crashed with a NameError further down.
        raise ValueError(f"Unsupported test_model: {test_model!r}")
    overall_qa_result = {"qa_result": {}}

    # fix: use a context manager instead of json.load(open(...)) so the
    # question file handle is closed deterministically.
    with open(question_path, 'r') as f:
        qa_dict = json.load(f)
    detail_qa, understanding_qa = qa_dict['detail'], qa_dict['understanding']
    input_token_all, output_token_all = 0, 0

    detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=detail_qa['questions'],
        answers=detail_qa['answers'],
        aspects=detail_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token

    understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=understanding_qa['questions'],
        answers=understanding_qa['answers'],
        aspects=understanding_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token

    overall_qa_result['qa_result'][test_model] = {
        'detail_accuracy': detail_accuracy,
        'detail_aspect_accuracy': detail_aspect_accuracy,
        'detail_agent_answers': detail_agent_answers,
        'understanding_accuracy': understanding_accuracy,
        'understanding_aspect_accuracy': understanding_aspect_accuracy,
        'understanding_agent_answers': understanding_agent_answers}

    # Average across all models present in the result dict (currently one).
    all_models_in_file = list(overall_qa_result['qa_result'].keys())
    detail_accs = []
    understanding_accs = []
    for m in all_models_in_file:
        detail_accs.append(overall_qa_result['qa_result'][m]['detail_accuracy'])
        understanding_accs.append(overall_qa_result['qa_result'][m]['understanding_accuracy'])

    avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
    avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

    overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
    overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

    # Finally, overwrite the same JSON file with the updated results
    with open(result_path, 'w') as f:
        json.dump(overall_qa_result, f, indent=4)
    print(detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token)
|
| 206 |
-
|
| 207 |
-
# Matches an optional quote then the run of digits at the start of a name.
_num_at_start = re.compile(r'^\s*["\']?(\d+)')


def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Return `paths` sorted by the integer prefix of each basename.

    Names without a leading number sort after all numbered names;
    ties are broken alphabetically by basename.
    """
    def _sort_key(path_str: str):
        basename = Path(path_str).name
        match = _num_at_start.match(basename)
        if match:
            return (int(match.group(1)), basename)
        return (float('inf'), basename)

    return sorted(paths, key=_sort_key)
|
| 214 |
-
|
| 215 |
-
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--result_dir", default="/path/to/result")
    parser.add_argument("-g", "--data_dir", default="/path/to/data")
    parser.add_argument("-s", "--save_dir", default="/path/to/data")
    # fix: `without_presenter_flag` was referenced below but never defined,
    # so the paper2video branch always crashed with a NameError.
    parser.add_argument("--without_presenter", action="store_true",
                        help="Evaluate the presenter-free render (1_merage.mp4) for paper2video results")
    args = parser.parse_args()
    without_presenter_flag = args.without_presenter

    ## mkdirs
    # fix: the original if/else had byte-identical branches; collapsed.
    save_dir = path.join(args.save_dir, path.basename(args.result_dir))
    save_path = path.join(save_dir, "qa_result")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_path, exist_ok=True)

    ## run test
    gt_dir, result_dir = args.data_dir, args.result_dir
    groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
    if path.basename(args.result_dir) == "human_made":
        result_list = []  # human-made baselines come from the dataset folder itself
    else:
        result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])

    start, end = 1, 100
    for index in range(start, end):
        qa_json_path = path.join(groundtruth_list[index], "4o-mini_qa.json")

        ## paper2video
        if path.basename(args.result_dir) == 'paper2video':
            if not without_presenter_flag:
                test_video_path = path.join(result_list[index], "3_merage.mp4")
            else:
                test_video_path = path.join(result_list[index], "1_merage.mp4")
            if not path.exists(test_video_path):
                continue
        ## human made as baseline
        elif path.basename(args.result_dir) == 'human_made':
            test_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
            if not path.exists(test_video_path):
                test_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
        ## veo3
        elif path.basename(args.result_dir) == 'veo3':
            test_video_path = result_list[index]
        elif path.basename(args.result_dir) == 'wan2.1':
            test_video_path = path.join(result_list[index], "result.mp4")
        ## presentagent
        else:
            test_video_path = path.join(result_list[index], "result.mp4")
        if not path.exists(test_video_path):
            continue
        result_save_path = path.join(save_path, "qa_result_{}.json".format(index))
        print("start")
        run_qa_metric(qa_json_path, test_video_path, result_save_path, 'gemini')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
from utils.poster_eval_utils import *
|
| 2 |
-
import argparse
|
| 3 |
-
import os
|
| 4 |
-
import json
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
os.environ["OPENAI_API_KEY"] = ""
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--paper_folder', type=str, default="path/to/data")
    parser.add_argument('--model_name', type=str, default='4o')
    args = parser.parse_args()

    # Extract the plain text of the paper from its PDF.
    paper_text = get_poster_text(os.path.join(args.paper_folder, 'pdf', 'paper.pdf'))

    if args.model_name == '4o':
        model_type = ModelType.GPT_4O
    elif args.model_name == 'o3':
        model_type = ModelType.O3
    elif args.model_name == 'gemini':
        model_type = ModelType.GEMINI_2_5_PRO
    else:
        # fix: any other value previously left `model_type` undefined and
        # crashed later with a NameError.
        raise ValueError(f"Unsupported model_name: {args.model_name!r}")

    # Generate two question sets: fine-grained detail and high-level understanding.
    detail_qa = get_questions(paper_text, 'detail', model_type)
    understanding_qa = get_questions(paper_text, 'understanding', model_type)

    # Split each set into questions, the answer key, and per-question aspects.
    detail_q, detail_a, detail_aspects = get_answers_and_remove_answers(detail_qa)
    understanding_q, understanding_a, understanding_aspects = get_answers_and_remove_answers(understanding_qa)

    final_qa = {
        'detail': {
            'questions': detail_q,
            'answers': detail_a,
            'aspects': detail_aspects,
        },
        'understanding': {
            'questions': understanding_q,
            'answers': understanding_a,
            'aspects': understanding_aspects,
        },
    }

    with open(os.path.join(args.paper_folder, f'{args.model_name}_qa.json'), 'w') as f:
        json.dump(final_qa, f, indent=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
from abc import ABC, abstractmethod
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import TYPE_CHECKING, Set, Union
|
| 5 |
-
|
| 6 |
-
from docling_core.types.doc import DoclingDocument
|
| 7 |
-
|
| 8 |
-
if TYPE_CHECKING:
|
| 9 |
-
from docling.datamodel.base_models import InputFormat
|
| 10 |
-
from docling.datamodel.document import InputDocument
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class AbstractDocumentBackend(ABC):
    """Common base for all document backends.

    Caches the identifying metadata of the input document and defines the
    minimal interface (validity check, format support, pagination support,
    resource release) every concrete backend must provide.
    """

    @abstractmethod
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        # Cache what every backend needs from the input document.
        self.file = in_doc.file
        self.path_or_stream = path_or_stream
        self.document_hash = in_doc.document_hash
        self.input_format = in_doc.format

    @abstractmethod
    def is_valid(self) -> bool:
        """Return True when the input was loaded/parsed successfully."""
        pass

    @classmethod
    @abstractmethod
    def supports_pagination(cls) -> bool:
        """Return True when the backend exposes page-level access."""
        pass

    def unload(self):
        """Release the underlying stream; safe to call more than once."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    @abstractmethod
    def supported_formats(cls) -> Set["InputFormat"]:
        """Return the set of InputFormat values this backend accepts."""
        pass
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class PaginatedDocumentBackend(AbstractDocumentBackend):
    """PaginatedDocumentBackend.

    A paginated document backend exposes page-level access to the input
    document (at minimum, its page count).
    """
    # NOTE(review): the original docstring here was copy-pasted from
    # DeclarativeDocumentBackend and described the wrong class; corrected.

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""
        pass
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class DeclarativeDocumentBackend(AbstractDocumentBackend):
    """DeclarativeDocumentBackend.

    A declarative document backend is a backend that can transform to DoclingDocument
    straight without a recognition pipeline.
    """

    @abstractmethod
    def convert(self) -> DoclingDocument:
        """Parse the input and return the resulting DoclingDocument."""
        pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py
DELETED
|
@@ -1,430 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import re
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Set, Union
|
| 6 |
-
|
| 7 |
-
from docling_core.types.doc import (
|
| 8 |
-
DocItemLabel,
|
| 9 |
-
DoclingDocument,
|
| 10 |
-
DocumentOrigin,
|
| 11 |
-
GroupItem,
|
| 12 |
-
GroupLabel,
|
| 13 |
-
ImageRef,
|
| 14 |
-
Size,
|
| 15 |
-
TableCell,
|
| 16 |
-
TableData,
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 20 |
-
from docling.datamodel.base_models import InputFormat
|
| 21 |
-
from docling.datamodel.document import InputDocument
|
| 22 |
-
|
| 23 |
-
_log = logging.getLogger(__name__)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
class AsciiDocBackend(DeclarativeDocumentBackend):
    """Declarative backend that parses AsciiDoc text into a DoclingDocument.

    Supports titles, section headers, (nested) lists, `|===`-delimited
    tables, `image::` macros and dot-prefixed captions; everything else is
    accumulated into paragraphs.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.lines = text_stream.split("\n")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    self.lines = f.readlines()
            self.valid = True

        except Exception as e:
            raise RuntimeError(
                f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
            ) from e
        return

    def is_valid(self) -> bool:
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
        return

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
        """
        Parses the ASCII into a structured document model.
        """

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/asciidoc",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        doc = self._parse(doc)

        return doc

    def _parse(self, doc: DoclingDocument):
        """
        Main function that orchestrates the parsing by yielding components:
        title, section headers, text, lists, and tables.
        """

        content = ""

        in_list = False
        in_table = False

        text_data: list[str] = []
        table_data: list[str] = []
        caption_data: list[str] = []

        # parents[k] is the open container at nesting depth k;
        # indents[k] is the source indent that opened depth k's list.
        parents: dict[int, Union[GroupItem, None]] = {}
        indents: dict[int, Union[GroupItem, None]] = {}

        for i in range(0, 10):
            parents[i] = None
            indents[i] = None

        for line in self.lines:

            # Title
            if self._is_title(line):
                item = self._parse_title(line)
                level = item["level"]

                parents[level] = doc.add_text(
                    text=item["text"], label=DocItemLabel.TITLE
                )

            # Section headers
            elif self._is_section_header(line):
                item = self._parse_section_header(line)
                level = item["level"]

                parents[level] = doc.add_heading(
                    text=item["text"], level=item["level"], parent=parents[level - 1]
                )
                # A new heading closes any deeper open containers.
                for k, v in parents.items():
                    if k > level:
                        parents[k] = None

            # Lists
            elif self._is_list_item(line):

                _log.debug(f"line: {line}")
                item = self._parse_list_item(line)
                _log.debug(f"parsed list-item: {item}")

                level = self._get_current_level(parents)

                if not in_list:
                    in_list = True

                    parents[level + 1] = doc.add_group(
                        parent=parents[level], name="list", label=GroupLabel.LIST
                    )
                    indents[level + 1] = item["indent"]

                elif in_list and item["indent"] > indents[level]:
                    # Deeper indent opens a nested list.
                    parents[level + 1] = doc.add_group(
                        parent=parents[level], name="list", label=GroupLabel.LIST
                    )
                    indents[level + 1] = item["indent"]

                elif in_list and item["indent"] < indents[level]:
                    # Shallower indent closes nested lists until levels match.
                    while item["indent"] < indents[level]:
                        parents[level] = None
                        indents[level] = None
                        level -= 1

                doc.add_list_item(
                    item["text"], parent=self._get_current_parent(parents)
                )

            elif in_list and not self._is_list_item(line):
                in_list = False

                level = self._get_current_level(parents)
                parents[level] = None

            # Tables
            elif line.strip() == "|===" and not in_table:  # start of table
                in_table = True

            elif self._is_table_line(line):  # within a table
                in_table = True
                table_data.append(self._parse_table_line(line))

            elif in_table and (
                (not self._is_table_line(line)) or line.strip() == "|==="
            ):  # end of table

                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
                    )

                caption_data = []

                data = self._populate_table_as_grid(table_data)
                doc.add_table(
                    data=data, parent=self._get_current_parent(parents), caption=caption
                )

                in_table = False
                table_data = []

            # Picture
            elif self._is_picture(line):

                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
                    )

                caption_data = []

                item = self._parse_picture(line)

                size = None
                if "width" in item and "height" in item:
                    size = Size(width=int(item["width"]), height=int(item["height"]))

                # Normalize local paths to file: URIs; http(s) URIs pass through.
                uri = None
                if (
                    "uri" in item
                    and not item["uri"].startswith("http")
                    and item["uri"].startswith("//")
                ):
                    uri = "file:" + item["uri"]
                elif (
                    "uri" in item
                    and not item["uri"].startswith("http")
                    and item["uri"].startswith("/")
                ):
                    uri = "file:/" + item["uri"]
                elif "uri" in item and not item["uri"].startswith("http"):
                    uri = "file://" + item["uri"]

                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
                doc.add_picture(image=image, caption=caption)

            # Caption
            elif self._is_caption(line) and len(caption_data) == 0:
                item = self._parse_caption(line)
                caption_data.append(item["text"])

            elif (
                len(line.strip()) > 0 and len(caption_data) > 0
            ):  # allow multiline captions
                item = self._parse_text(line)
                caption_data.append(item["text"])

            # Plain text: a blank line flushes the accumulated paragraph.
            elif len(line.strip()) == 0 and len(text_data) > 0:
                doc.add_text(
                    text=" ".join(text_data),
                    label=DocItemLabel.PARAGRAPH,
                    parent=self._get_current_parent(parents),
                )
                text_data = []

            elif len(line.strip()) > 0:  # allow multiline texts

                item = self._parse_text(line)
                text_data.append(item["text"])

        # Flush any trailing paragraph / table at end of input.
        if len(text_data) > 0:
            doc.add_text(
                text=" ".join(text_data),
                label=DocItemLabel.PARAGRAPH,
                parent=self._get_current_parent(parents),
            )
            text_data = []

        if in_table and len(table_data) > 0:
            data = self._populate_table_as_grid(table_data)
            doc.add_table(data=data, parent=self._get_current_parent(parents))

            in_table = False
            table_data = []

        return doc

    def _get_current_level(self, parents):
        # Depth of the deepest open container (0 when none is open).
        for k, v in parents.items():
            if v is None and k > 0:  # fix: identity check instead of `== None`
                return k - 1

        return 0

    def _get_current_parent(self, parents):
        # The deepest open container item, or None at the document root.
        for k, v in parents.items():
            if v is None and k > 0:  # fix: identity check instead of `== None`
                return parents[k - 1]

        return None

    # ========= Title
    def _is_title(self, line):
        return re.match(r"^= ", line)

    def _parse_title(self, line):
        return {"type": "title", "text": line[2:].strip(), "level": 0}

    # ========= Section headers
    def _is_section_header(self, line):
        return re.match(r"^==+", line)

    def _parse_section_header(self, line):
        match = re.match(r"^(=+)\s+(.*)", line)
        if match is None:
            # fix: lines like '==foo' (no space after the markers) passed
            # _is_section_header but crashed here on match.group(); treat the
            # remainder after the markers as the heading text.
            stripped = line.lstrip("=")
            marker_len = len(line) - len(stripped)
            return {
                "type": "header",
                "level": max(marker_len - 1, 1),
                "text": stripped.strip(),
            }

        marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
        text = match.group(2)  # The actual text of the list item

        header_level = marker.count("=")  # number of '=' represents level
        return {
            "type": "header",
            "level": header_level - 1,
            "text": text.strip(),
        }

    # ========= Lists
    def _is_list_item(self, line):
        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

    def _parse_list_item(self, line):
        """Extract the item marker (number or bullet symbol) and the text of the item."""

        match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
        if match:
            indent = match.group(1)
            marker = match.group(2)  # The list marker (e.g., "*", "-", "1.")
            text = match.group(3)  # The actual text of the list item

            if marker == "*" or marker == "-":
                return {
                    "type": "list_item",
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": False,
                    "indent": 0 if indent is None else len(indent),
                }
            else:
                return {
                    "type": "list_item",
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": True,
                    "indent": 0 if indent is None else len(indent),
                }
        else:
            # Fallback if no match
            return {
                "type": "list_item",
                "marker": "-",
                "text": line,
                "numbered": False,
                "indent": 0,
            }

    # ========= Tables
    def _is_table_line(self, line):
        return re.match(r"^\|.*\|", line)

    def _parse_table_line(self, line):
        # Split table cells and trim extra spaces
        return [cell.strip() for cell in line.split("|") if cell.strip()]

    def _populate_table_as_grid(self, table_data):

        num_rows = len(table_data)

        # Adjust the table data into a grid format
        num_cols = max(len(row) for row in table_data)

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
        for row_idx, row in enumerate(table_data):

            for col_idx, text in enumerate(row):
                row_span = 1
                col_span = 1

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
                    row_header=False,
                )
                data.table_cells.append(cell)

        return data

    # ========= Pictures
    def _is_picture(self, line):
        return re.match(r"^image::", line)

    def _parse_picture(self, line):
        """
        Parse an image macro, extracting its path and attributes.
        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
        """
        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
        if mtch:
            picture_path = mtch.group(1).strip()
            attributes = mtch.group(2).split(",")
            picture_info = {"type": "picture", "uri": picture_path}

            # Extract optional attributes (alt text, width, height, alignment)
            if attributes:
                picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
                for attr in attributes[1:]:
                    # fix: `attr.split("=")` raised ValueError for attributes
                    # without exactly one '='; use partition and skip malformed ones.
                    key, sep, value = attr.partition("=")
                    if sep:
                        picture_info[key.strip()] = value.strip()

            return picture_info

        return {"type": "picture", "uri": line}

    # ========= Captions
    def _is_caption(self, line):
        return re.match(r"^\.(.+)", line)

    def _parse_caption(self, line):
        mtch = re.match(r"^\.(.+)", line)
        if mtch:
            text = mtch.group(1)
            return {"type": "caption", "text": text}

        return {"type": "caption", "text": ""}

    # ========= Plain text
    def _parse_text(self, line):
        return {"type": "text", "text": line.strip()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py
DELETED
|
@@ -1,227 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import random
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Iterable, List, Optional, Union
|
| 6 |
-
|
| 7 |
-
import pypdfium2 as pdfium
|
| 8 |
-
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
| 9 |
-
from docling_parse.pdf_parsers import pdf_parser_v1
|
| 10 |
-
from PIL import Image, ImageDraw
|
| 11 |
-
from pypdfium2 import PdfPage
|
| 12 |
-
|
| 13 |
-
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
| 14 |
-
from docling.datamodel.base_models import Cell
|
| 15 |
-
from docling.datamodel.document import InputDocument
|
| 16 |
-
|
| 17 |
-
_log = logging.getLogger(__name__)
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class DoclingParsePageBackend(PdfPageBackend):
|
| 21 |
-
def __init__(
|
| 22 |
-
self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
|
| 23 |
-
):
|
| 24 |
-
self._ppage = page_obj
|
| 25 |
-
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
| 26 |
-
|
| 27 |
-
self.valid = "pages" in parsed_page
|
| 28 |
-
if self.valid:
|
| 29 |
-
self._dpage = parsed_page["pages"][0]
|
| 30 |
-
else:
|
| 31 |
-
_log.info(
|
| 32 |
-
f"An error occurred when loading page {page_no} of document {document_hash}."
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
def is_valid(self) -> bool:
|
| 36 |
-
return self.valid
|
| 37 |
-
|
| 38 |
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
| 39 |
-
if not self.valid:
|
| 40 |
-
return ""
|
| 41 |
-
# Find intersecting cells on the page
|
| 42 |
-
text_piece = ""
|
| 43 |
-
page_size = self.get_size()
|
| 44 |
-
parser_width = self._dpage["width"]
|
| 45 |
-
parser_height = self._dpage["height"]
|
| 46 |
-
|
| 47 |
-
scale = (
|
| 48 |
-
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
for i in range(len(self._dpage["cells"])):
|
| 52 |
-
rect = self._dpage["cells"][i]["box"]["device"]
|
| 53 |
-
x0, y0, x1, y1 = rect
|
| 54 |
-
cell_bbox = BoundingBox(
|
| 55 |
-
l=x0 * scale * page_size.width / parser_width,
|
| 56 |
-
b=y0 * scale * page_size.height / parser_height,
|
| 57 |
-
r=x1 * scale * page_size.width / parser_width,
|
| 58 |
-
t=y1 * scale * page_size.height / parser_height,
|
| 59 |
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
| 60 |
-
).to_top_left_origin(page_height=page_size.height * scale)
|
| 61 |
-
|
| 62 |
-
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
| 63 |
-
|
| 64 |
-
if overlap_frac > 0.5:
|
| 65 |
-
if len(text_piece) > 0:
|
| 66 |
-
text_piece += " "
|
| 67 |
-
text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
|
| 68 |
-
|
| 69 |
-
return text_piece
|
| 70 |
-
|
| 71 |
-
def get_text_cells(self) -> Iterable[Cell]:
|
| 72 |
-
cells: List[Cell] = []
|
| 73 |
-
cell_counter = 0
|
| 74 |
-
|
| 75 |
-
if not self.valid:
|
| 76 |
-
return cells
|
| 77 |
-
|
| 78 |
-
page_size = self.get_size()
|
| 79 |
-
|
| 80 |
-
parser_width = self._dpage["width"]
|
| 81 |
-
parser_height = self._dpage["height"]
|
| 82 |
-
|
| 83 |
-
for i in range(len(self._dpage["cells"])):
|
| 84 |
-
rect = self._dpage["cells"][i]["box"]["device"]
|
| 85 |
-
x0, y0, x1, y1 = rect
|
| 86 |
-
|
| 87 |
-
if x1 < x0:
|
| 88 |
-
x0, x1 = x1, x0
|
| 89 |
-
if y1 < y0:
|
| 90 |
-
y0, y1 = y1, y0
|
| 91 |
-
|
| 92 |
-
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
| 93 |
-
cells.append(
|
| 94 |
-
Cell(
|
| 95 |
-
id=cell_counter,
|
| 96 |
-
text=text_piece,
|
| 97 |
-
bbox=BoundingBox(
|
| 98 |
-
# l=x0, b=y0, r=x1, t=y1,
|
| 99 |
-
l=x0 * page_size.width / parser_width,
|
| 100 |
-
b=y0 * page_size.height / parser_height,
|
| 101 |
-
r=x1 * page_size.width / parser_width,
|
| 102 |
-
t=y1 * page_size.height / parser_height,
|
| 103 |
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
| 104 |
-
).to_top_left_origin(page_size.height),
|
| 105 |
-
)
|
| 106 |
-
)
|
| 107 |
-
cell_counter += 1
|
| 108 |
-
|
| 109 |
-
def draw_clusters_and_cells():
|
| 110 |
-
image = (
|
| 111 |
-
self.get_page_image()
|
| 112 |
-
) # make new image to avoid drawing on the saved ones
|
| 113 |
-
draw = ImageDraw.Draw(image)
|
| 114 |
-
for c in cells:
|
| 115 |
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
| 116 |
-
cell_color = (
|
| 117 |
-
random.randint(30, 140),
|
| 118 |
-
random.randint(30, 140),
|
| 119 |
-
random.randint(30, 140),
|
| 120 |
-
)
|
| 121 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
| 122 |
-
image.show()
|
| 123 |
-
|
| 124 |
-
# before merge:
|
| 125 |
-
# draw_clusters_and_cells()
|
| 126 |
-
|
| 127 |
-
# cells = merge_horizontal_cells(cells)
|
| 128 |
-
|
| 129 |
-
# after merge:
|
| 130 |
-
# draw_clusters_and_cells()
|
| 131 |
-
|
| 132 |
-
return cells
|
| 133 |
-
|
| 134 |
-
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
| 135 |
-
AREA_THRESHOLD = 0 # 32 * 32
|
| 136 |
-
|
| 137 |
-
for i in range(len(self._dpage["images"])):
|
| 138 |
-
bitmap = self._dpage["images"][i]
|
| 139 |
-
cropbox = BoundingBox.from_tuple(
|
| 140 |
-
bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
|
| 141 |
-
).to_top_left_origin(self.get_size().height)
|
| 142 |
-
|
| 143 |
-
if cropbox.area() > AREA_THRESHOLD:
|
| 144 |
-
cropbox = cropbox.scaled(scale=scale)
|
| 145 |
-
|
| 146 |
-
yield cropbox
|
| 147 |
-
|
| 148 |
-
def get_page_image(
|
| 149 |
-
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
| 150 |
-
) -> Image.Image:
|
| 151 |
-
|
| 152 |
-
page_size = self.get_size()
|
| 153 |
-
|
| 154 |
-
if not cropbox:
|
| 155 |
-
cropbox = BoundingBox(
|
| 156 |
-
l=0,
|
| 157 |
-
r=page_size.width,
|
| 158 |
-
t=0,
|
| 159 |
-
b=page_size.height,
|
| 160 |
-
coord_origin=CoordOrigin.TOPLEFT,
|
| 161 |
-
)
|
| 162 |
-
padbox = BoundingBox(
|
| 163 |
-
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
| 164 |
-
)
|
| 165 |
-
else:
|
| 166 |
-
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
| 167 |
-
padbox.r = page_size.width - padbox.r
|
| 168 |
-
padbox.t = page_size.height - padbox.t
|
| 169 |
-
|
| 170 |
-
image = (
|
| 171 |
-
self._ppage.render(
|
| 172 |
-
scale=scale * 1.5,
|
| 173 |
-
rotation=0, # no additional rotation
|
| 174 |
-
crop=padbox.as_tuple(),
|
| 175 |
-
)
|
| 176 |
-
.to_pil()
|
| 177 |
-
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
| 178 |
-
) # We resize the image from 1.5x the given scale to make it sharper.
|
| 179 |
-
|
| 180 |
-
return image
|
| 181 |
-
|
| 182 |
-
def get_size(self) -> Size:
|
| 183 |
-
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
| 184 |
-
|
| 185 |
-
def unload(self):
|
| 186 |
-
self._ppage = None
|
| 187 |
-
self._dpage = None
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
| 191 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 192 |
-
super().__init__(in_doc, path_or_stream)
|
| 193 |
-
|
| 194 |
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
| 195 |
-
self.parser = pdf_parser_v1()
|
| 196 |
-
|
| 197 |
-
success = False
|
| 198 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 199 |
-
success = self.parser.load_document_from_bytesio(
|
| 200 |
-
self.document_hash, self.path_or_stream
|
| 201 |
-
)
|
| 202 |
-
elif isinstance(self.path_or_stream, Path):
|
| 203 |
-
success = self.parser.load_document(
|
| 204 |
-
self.document_hash, str(self.path_or_stream)
|
| 205 |
-
)
|
| 206 |
-
|
| 207 |
-
if not success:
|
| 208 |
-
raise RuntimeError(
|
| 209 |
-
f"docling-parse could not load document with hash {self.document_hash}."
|
| 210 |
-
)
|
| 211 |
-
|
| 212 |
-
def page_count(self) -> int:
|
| 213 |
-
return len(self._pdoc) # To be replaced with docling-parse API
|
| 214 |
-
|
| 215 |
-
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
| 216 |
-
return DoclingParsePageBackend(
|
| 217 |
-
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
| 218 |
-
)
|
| 219 |
-
|
| 220 |
-
def is_valid(self) -> bool:
|
| 221 |
-
return self.page_count() > 0
|
| 222 |
-
|
| 223 |
-
def unload(self):
|
| 224 |
-
super().unload()
|
| 225 |
-
self.parser.unload_document(self.document_hash)
|
| 226 |
-
self._pdoc.close()
|
| 227 |
-
self._pdoc = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py
DELETED
|
@@ -1,250 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import random
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
| 6 |
-
|
| 7 |
-
import pypdfium2 as pdfium
|
| 8 |
-
from docling_core.types.doc import BoundingBox, CoordOrigin
|
| 9 |
-
from docling_parse.pdf_parsers import pdf_parser_v2
|
| 10 |
-
from PIL import Image, ImageDraw
|
| 11 |
-
from pypdfium2 import PdfPage
|
| 12 |
-
|
| 13 |
-
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
| 14 |
-
from docling.datamodel.base_models import Cell, Size
|
| 15 |
-
|
| 16 |
-
if TYPE_CHECKING:
|
| 17 |
-
from docling.datamodel.document import InputDocument
|
| 18 |
-
|
| 19 |
-
_log = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class DoclingParseV2PageBackend(PdfPageBackend):
|
| 23 |
-
def __init__(
|
| 24 |
-
self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
|
| 25 |
-
):
|
| 26 |
-
self._ppage = page_obj
|
| 27 |
-
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
| 28 |
-
|
| 29 |
-
self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
|
| 30 |
-
if self.valid:
|
| 31 |
-
self._dpage = parsed_page["pages"][0]
|
| 32 |
-
else:
|
| 33 |
-
_log.info(
|
| 34 |
-
f"An error occurred when loading page {page_no} of document {document_hash}."
|
| 35 |
-
)
|
| 36 |
-
|
| 37 |
-
def is_valid(self) -> bool:
|
| 38 |
-
return self.valid
|
| 39 |
-
|
| 40 |
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
| 41 |
-
if not self.valid:
|
| 42 |
-
return ""
|
| 43 |
-
# Find intersecting cells on the page
|
| 44 |
-
text_piece = ""
|
| 45 |
-
page_size = self.get_size()
|
| 46 |
-
|
| 47 |
-
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
| 48 |
-
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
| 49 |
-
|
| 50 |
-
scale = (
|
| 51 |
-
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
| 55 |
-
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
| 56 |
-
|
| 57 |
-
for i, cell_data in enumerate(cells_data):
|
| 58 |
-
x0 = cell_data[cells_header.index("x0")]
|
| 59 |
-
y0 = cell_data[cells_header.index("y0")]
|
| 60 |
-
x1 = cell_data[cells_header.index("x1")]
|
| 61 |
-
y1 = cell_data[cells_header.index("y1")]
|
| 62 |
-
|
| 63 |
-
cell_bbox = BoundingBox(
|
| 64 |
-
l=x0 * scale * page_size.width / parser_width,
|
| 65 |
-
b=y0 * scale * page_size.height / parser_height,
|
| 66 |
-
r=x1 * scale * page_size.width / parser_width,
|
| 67 |
-
t=y1 * scale * page_size.height / parser_height,
|
| 68 |
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
| 69 |
-
).to_top_left_origin(page_height=page_size.height * scale)
|
| 70 |
-
|
| 71 |
-
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
| 72 |
-
|
| 73 |
-
if overlap_frac > 0.5:
|
| 74 |
-
if len(text_piece) > 0:
|
| 75 |
-
text_piece += " "
|
| 76 |
-
text_piece += cell_data[cells_header.index("text")]
|
| 77 |
-
|
| 78 |
-
return text_piece
|
| 79 |
-
|
| 80 |
-
def get_text_cells(self) -> Iterable[Cell]:
|
| 81 |
-
cells: List[Cell] = []
|
| 82 |
-
cell_counter = 0
|
| 83 |
-
|
| 84 |
-
if not self.valid:
|
| 85 |
-
return cells
|
| 86 |
-
|
| 87 |
-
page_size = self.get_size()
|
| 88 |
-
|
| 89 |
-
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
| 90 |
-
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
| 91 |
-
|
| 92 |
-
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
| 93 |
-
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
| 94 |
-
|
| 95 |
-
for i, cell_data in enumerate(cells_data):
|
| 96 |
-
x0 = cell_data[cells_header.index("x0")]
|
| 97 |
-
y0 = cell_data[cells_header.index("y0")]
|
| 98 |
-
x1 = cell_data[cells_header.index("x1")]
|
| 99 |
-
y1 = cell_data[cells_header.index("y1")]
|
| 100 |
-
|
| 101 |
-
if x1 < x0:
|
| 102 |
-
x0, x1 = x1, x0
|
| 103 |
-
if y1 < y0:
|
| 104 |
-
y0, y1 = y1, y0
|
| 105 |
-
|
| 106 |
-
text_piece = cell_data[cells_header.index("text")]
|
| 107 |
-
cells.append(
|
| 108 |
-
Cell(
|
| 109 |
-
id=cell_counter,
|
| 110 |
-
text=text_piece,
|
| 111 |
-
bbox=BoundingBox(
|
| 112 |
-
# l=x0, b=y0, r=x1, t=y1,
|
| 113 |
-
l=x0 * page_size.width / parser_width,
|
| 114 |
-
b=y0 * page_size.height / parser_height,
|
| 115 |
-
r=x1 * page_size.width / parser_width,
|
| 116 |
-
t=y1 * page_size.height / parser_height,
|
| 117 |
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
| 118 |
-
).to_top_left_origin(page_size.height),
|
| 119 |
-
)
|
| 120 |
-
)
|
| 121 |
-
cell_counter += 1
|
| 122 |
-
|
| 123 |
-
def draw_clusters_and_cells():
|
| 124 |
-
image = (
|
| 125 |
-
self.get_page_image()
|
| 126 |
-
) # make new image to avoid drawing on the saved ones
|
| 127 |
-
draw = ImageDraw.Draw(image)
|
| 128 |
-
for c in cells:
|
| 129 |
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
| 130 |
-
cell_color = (
|
| 131 |
-
random.randint(30, 140),
|
| 132 |
-
random.randint(30, 140),
|
| 133 |
-
random.randint(30, 140),
|
| 134 |
-
)
|
| 135 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
| 136 |
-
image.show()
|
| 137 |
-
|
| 138 |
-
# draw_clusters_and_cells()
|
| 139 |
-
|
| 140 |
-
return cells
|
| 141 |
-
|
| 142 |
-
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
| 143 |
-
AREA_THRESHOLD = 0 # 32 * 32
|
| 144 |
-
|
| 145 |
-
images = self._dpage["sanitized"]["images"]["data"]
|
| 146 |
-
images_header = self._dpage["sanitized"]["images"]["header"]
|
| 147 |
-
|
| 148 |
-
for row in images:
|
| 149 |
-
x0 = row[images_header.index("x0")]
|
| 150 |
-
y0 = row[images_header.index("y0")]
|
| 151 |
-
x1 = row[images_header.index("x1")]
|
| 152 |
-
y1 = row[images_header.index("y1")]
|
| 153 |
-
|
| 154 |
-
cropbox = BoundingBox.from_tuple(
|
| 155 |
-
(x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
|
| 156 |
-
).to_top_left_origin(self.get_size().height)
|
| 157 |
-
|
| 158 |
-
if cropbox.area() > AREA_THRESHOLD:
|
| 159 |
-
cropbox = cropbox.scaled(scale=scale)
|
| 160 |
-
|
| 161 |
-
yield cropbox
|
| 162 |
-
|
| 163 |
-
def get_page_image(
|
| 164 |
-
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
| 165 |
-
) -> Image.Image:
|
| 166 |
-
|
| 167 |
-
page_size = self.get_size()
|
| 168 |
-
|
| 169 |
-
if not cropbox:
|
| 170 |
-
cropbox = BoundingBox(
|
| 171 |
-
l=0,
|
| 172 |
-
r=page_size.width,
|
| 173 |
-
t=0,
|
| 174 |
-
b=page_size.height,
|
| 175 |
-
coord_origin=CoordOrigin.TOPLEFT,
|
| 176 |
-
)
|
| 177 |
-
padbox = BoundingBox(
|
| 178 |
-
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
| 179 |
-
)
|
| 180 |
-
else:
|
| 181 |
-
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
| 182 |
-
padbox.r = page_size.width - padbox.r
|
| 183 |
-
padbox.t = page_size.height - padbox.t
|
| 184 |
-
|
| 185 |
-
image = (
|
| 186 |
-
self._ppage.render(
|
| 187 |
-
scale=scale * 1.5,
|
| 188 |
-
rotation=0, # no additional rotation
|
| 189 |
-
crop=padbox.as_tuple(),
|
| 190 |
-
)
|
| 191 |
-
.to_pil()
|
| 192 |
-
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
| 193 |
-
) # We resize the image from 1.5x the given scale to make it sharper.
|
| 194 |
-
|
| 195 |
-
return image
|
| 196 |
-
|
| 197 |
-
def get_size(self) -> Size:
|
| 198 |
-
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
| 199 |
-
|
| 200 |
-
def unload(self):
|
| 201 |
-
self._ppage = None
|
| 202 |
-
self._dpage = None
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
| 206 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 207 |
-
super().__init__(in_doc, path_or_stream)
|
| 208 |
-
|
| 209 |
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
| 210 |
-
self.parser = pdf_parser_v2("fatal")
|
| 211 |
-
|
| 212 |
-
success = False
|
| 213 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 214 |
-
success = self.parser.load_document_from_bytesio(
|
| 215 |
-
self.document_hash, self.path_or_stream
|
| 216 |
-
)
|
| 217 |
-
elif isinstance(self.path_or_stream, Path):
|
| 218 |
-
success = self.parser.load_document(
|
| 219 |
-
self.document_hash, str(self.path_or_stream)
|
| 220 |
-
)
|
| 221 |
-
|
| 222 |
-
if not success:
|
| 223 |
-
raise RuntimeError(
|
| 224 |
-
f"docling-parse v2 could not load document {self.document_hash}."
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
def page_count(self) -> int:
|
| 228 |
-
# return len(self._pdoc) # To be replaced with docling-parse API
|
| 229 |
-
|
| 230 |
-
len_1 = len(self._pdoc)
|
| 231 |
-
len_2 = self.parser.number_of_pages(self.document_hash)
|
| 232 |
-
|
| 233 |
-
if len_1 != len_2:
|
| 234 |
-
_log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
|
| 235 |
-
|
| 236 |
-
return len_2
|
| 237 |
-
|
| 238 |
-
def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
|
| 239 |
-
return DoclingParseV2PageBackend(
|
| 240 |
-
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
def is_valid(self) -> bool:
|
| 244 |
-
return self.page_count() > 0
|
| 245 |
-
|
| 246 |
-
def unload(self):
|
| 247 |
-
super().unload()
|
| 248 |
-
self.parser.unload_document(self.document_hash)
|
| 249 |
-
self._pdoc.close()
|
| 250 |
-
self._pdoc = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py
DELETED
|
@@ -1,442 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Optional, Set, Union
|
| 5 |
-
|
| 6 |
-
from bs4 import BeautifulSoup, Tag
|
| 7 |
-
from docling_core.types.doc import (
|
| 8 |
-
DocItemLabel,
|
| 9 |
-
DoclingDocument,
|
| 10 |
-
DocumentOrigin,
|
| 11 |
-
GroupLabel,
|
| 12 |
-
TableCell,
|
| 13 |
-
TableData,
|
| 14 |
-
)
|
| 15 |
-
|
| 16 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 17 |
-
from docling.datamodel.base_models import InputFormat
|
| 18 |
-
from docling.datamodel.document import InputDocument
|
| 19 |
-
|
| 20 |
-
_log = logging.getLogger(__name__)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
| 24 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 25 |
-
super().__init__(in_doc, path_or_stream)
|
| 26 |
-
_log.debug("About to init HTML backend...")
|
| 27 |
-
self.soup: Optional[Tag] = None
|
| 28 |
-
# HTML file:
|
| 29 |
-
self.path_or_stream = path_or_stream
|
| 30 |
-
# Initialise the parents for the hierarchy
|
| 31 |
-
self.max_levels = 10
|
| 32 |
-
self.level = 0
|
| 33 |
-
self.parents = {} # type: ignore
|
| 34 |
-
for i in range(0, self.max_levels):
|
| 35 |
-
self.parents[i] = None
|
| 36 |
-
self.labels = {} # type: ignore
|
| 37 |
-
|
| 38 |
-
try:
|
| 39 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 40 |
-
text_stream = self.path_or_stream.getvalue()
|
| 41 |
-
self.soup = BeautifulSoup(text_stream, "html.parser")
|
| 42 |
-
if isinstance(self.path_or_stream, Path):
|
| 43 |
-
with open(self.path_or_stream, "rb") as f:
|
| 44 |
-
html_content = f.read()
|
| 45 |
-
self.soup = BeautifulSoup(html_content, "html.parser")
|
| 46 |
-
except Exception as e:
|
| 47 |
-
raise RuntimeError(
|
| 48 |
-
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
| 49 |
-
) from e
|
| 50 |
-
|
| 51 |
-
def is_valid(self) -> bool:
|
| 52 |
-
return self.soup is not None
|
| 53 |
-
|
| 54 |
-
@classmethod
|
| 55 |
-
def supports_pagination(cls) -> bool:
|
| 56 |
-
return False
|
| 57 |
-
|
| 58 |
-
def unload(self):
|
| 59 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 60 |
-
self.path_or_stream.close()
|
| 61 |
-
|
| 62 |
-
self.path_or_stream = None
|
| 63 |
-
|
| 64 |
-
@classmethod
|
| 65 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 66 |
-
return {InputFormat.HTML}
|
| 67 |
-
|
| 68 |
-
def convert(self) -> DoclingDocument:
|
| 69 |
-
# access self.path_or_stream to load stuff
|
| 70 |
-
origin = DocumentOrigin(
|
| 71 |
-
filename=self.file.name or "file",
|
| 72 |
-
mimetype="text/html",
|
| 73 |
-
binary_hash=self.document_hash,
|
| 74 |
-
)
|
| 75 |
-
|
| 76 |
-
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
| 77 |
-
_log.debug("Trying to convert HTML...")
|
| 78 |
-
|
| 79 |
-
if self.is_valid():
|
| 80 |
-
assert self.soup is not None
|
| 81 |
-
content = self.soup.body or self.soup
|
| 82 |
-
# Replace <br> tags with newline characters
|
| 83 |
-
for br in content.find_all("br"):
|
| 84 |
-
br.replace_with("\n")
|
| 85 |
-
doc = self.walk(content, doc)
|
| 86 |
-
else:
|
| 87 |
-
raise RuntimeError(
|
| 88 |
-
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
| 89 |
-
)
|
| 90 |
-
return doc
|
| 91 |
-
|
| 92 |
-
def walk(self, element: Tag, doc: DoclingDocument):
|
| 93 |
-
try:
|
| 94 |
-
# Iterate over elements in the body of the document
|
| 95 |
-
for idx, element in enumerate(element.children):
|
| 96 |
-
try:
|
| 97 |
-
self.analyse_element(element, idx, doc)
|
| 98 |
-
except Exception as exc_child:
|
| 99 |
-
|
| 100 |
-
_log.error(" -> error treating child: ", exc_child)
|
| 101 |
-
_log.error(" => element: ", element, "\n")
|
| 102 |
-
raise exc_child
|
| 103 |
-
|
| 104 |
-
except Exception as exc:
|
| 105 |
-
pass
|
| 106 |
-
|
| 107 |
-
return doc
|
| 108 |
-
|
| 109 |
-
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 110 |
-
"""
|
| 111 |
-
if element.name!=None:
|
| 112 |
-
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
| 113 |
-
"""
|
| 114 |
-
|
| 115 |
-
if element.name in self.labels:
|
| 116 |
-
self.labels[element.name] += 1
|
| 117 |
-
else:
|
| 118 |
-
self.labels[element.name] = 1
|
| 119 |
-
|
| 120 |
-
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
| 121 |
-
self.handle_header(element, idx, doc)
|
| 122 |
-
elif element.name in ["p"]:
|
| 123 |
-
self.handle_paragraph(element, idx, doc)
|
| 124 |
-
elif element.name in ["pre"]:
|
| 125 |
-
self.handle_code(element, idx, doc)
|
| 126 |
-
elif element.name in ["ul", "ol"]:
|
| 127 |
-
self.handle_list(element, idx, doc)
|
| 128 |
-
elif element.name in ["li"]:
|
| 129 |
-
self.handle_listitem(element, idx, doc)
|
| 130 |
-
elif element.name == "table":
|
| 131 |
-
self.handle_table(element, idx, doc)
|
| 132 |
-
elif element.name == "figure":
|
| 133 |
-
self.handle_figure(element, idx, doc)
|
| 134 |
-
elif element.name == "img":
|
| 135 |
-
self.handle_image(element, idx, doc)
|
| 136 |
-
else:
|
| 137 |
-
self.walk(element, doc)
|
| 138 |
-
|
| 139 |
-
def get_direct_text(self, item: Tag):
|
| 140 |
-
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
| 141 |
-
text = item.find(string=True, recursive=False)
|
| 142 |
-
if isinstance(text, str):
|
| 143 |
-
return text.strip()
|
| 144 |
-
|
| 145 |
-
return ""
|
| 146 |
-
|
| 147 |
-
# Function to recursively extract text from all child nodes
|
| 148 |
-
def extract_text_recursively(self, item: Tag):
|
| 149 |
-
result = []
|
| 150 |
-
|
| 151 |
-
if isinstance(item, str):
|
| 152 |
-
return [item]
|
| 153 |
-
|
| 154 |
-
if item.name not in ["ul", "ol"]:
|
| 155 |
-
try:
|
| 156 |
-
# Iterate over the children (and their text and tails)
|
| 157 |
-
for child in item:
|
| 158 |
-
try:
|
| 159 |
-
# Recursively get the child's text content
|
| 160 |
-
result.extend(self.extract_text_recursively(child))
|
| 161 |
-
except:
|
| 162 |
-
pass
|
| 163 |
-
except:
|
| 164 |
-
_log.warn("item has no children")
|
| 165 |
-
pass
|
| 166 |
-
|
| 167 |
-
return "".join(result) + " "
|
| 168 |
-
|
| 169 |
-
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 170 |
-
"""Handles header tags (h1, h2, etc.)."""
|
| 171 |
-
hlevel = int(element.name.replace("h", ""))
|
| 172 |
-
slevel = hlevel - 1
|
| 173 |
-
|
| 174 |
-
label = DocItemLabel.SECTION_HEADER
|
| 175 |
-
text = element.text.strip()
|
| 176 |
-
|
| 177 |
-
if hlevel == 1:
|
| 178 |
-
for key, val in self.parents.items():
|
| 179 |
-
self.parents[key] = None
|
| 180 |
-
|
| 181 |
-
self.level = 1
|
| 182 |
-
self.parents[self.level] = doc.add_text(
|
| 183 |
-
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
| 184 |
-
)
|
| 185 |
-
else:
|
| 186 |
-
if hlevel > self.level:
|
| 187 |
-
|
| 188 |
-
# add invisible group
|
| 189 |
-
for i in range(self.level + 1, hlevel):
|
| 190 |
-
self.parents[i] = doc.add_group(
|
| 191 |
-
name=f"header-{i}",
|
| 192 |
-
label=GroupLabel.SECTION,
|
| 193 |
-
parent=self.parents[i - 1],
|
| 194 |
-
)
|
| 195 |
-
self.level = hlevel
|
| 196 |
-
|
| 197 |
-
elif hlevel < self.level:
|
| 198 |
-
|
| 199 |
-
# remove the tail
|
| 200 |
-
for key, val in self.parents.items():
|
| 201 |
-
if key > hlevel:
|
| 202 |
-
self.parents[key] = None
|
| 203 |
-
self.level = hlevel
|
| 204 |
-
|
| 205 |
-
self.parents[hlevel] = doc.add_heading(
|
| 206 |
-
parent=self.parents[hlevel - 1],
|
| 207 |
-
text=text,
|
| 208 |
-
level=hlevel,
|
| 209 |
-
)
|
| 210 |
-
|
| 211 |
-
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 212 |
-
"""Handles monospace code snippets (pre)."""
|
| 213 |
-
if element.text is None:
|
| 214 |
-
return
|
| 215 |
-
text = element.text.strip()
|
| 216 |
-
label = DocItemLabel.CODE
|
| 217 |
-
if len(text) == 0:
|
| 218 |
-
return
|
| 219 |
-
doc.add_code(parent=self.parents[self.level], text=text)
|
| 220 |
-
|
| 221 |
-
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 222 |
-
"""Handles paragraph tags (p)."""
|
| 223 |
-
if element.text is None:
|
| 224 |
-
return
|
| 225 |
-
text = element.text.strip()
|
| 226 |
-
label = DocItemLabel.PARAGRAPH
|
| 227 |
-
if len(text) == 0:
|
| 228 |
-
return
|
| 229 |
-
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
| 230 |
-
|
| 231 |
-
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 232 |
-
"""Handles list tags (ul, ol) and their list items."""
|
| 233 |
-
|
| 234 |
-
if element.name == "ul":
|
| 235 |
-
# create a list group
|
| 236 |
-
self.parents[self.level + 1] = doc.add_group(
|
| 237 |
-
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
|
| 238 |
-
)
|
| 239 |
-
elif element.name == "ol":
|
| 240 |
-
# create a list group
|
| 241 |
-
self.parents[self.level + 1] = doc.add_group(
|
| 242 |
-
parent=self.parents[self.level],
|
| 243 |
-
name="ordered list",
|
| 244 |
-
label=GroupLabel.ORDERED_LIST,
|
| 245 |
-
)
|
| 246 |
-
self.level += 1
|
| 247 |
-
|
| 248 |
-
self.walk(element, doc)
|
| 249 |
-
|
| 250 |
-
self.parents[self.level + 1] = None
|
| 251 |
-
self.level -= 1
|
| 252 |
-
|
| 253 |
-
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 254 |
-
"""Handles listitem tags (li)."""
|
| 255 |
-
nested_lists = element.find(["ul", "ol"])
|
| 256 |
-
|
| 257 |
-
parent_list_label = self.parents[self.level].label
|
| 258 |
-
index_in_list = len(self.parents[self.level].children) + 1
|
| 259 |
-
|
| 260 |
-
if nested_lists:
|
| 261 |
-
name = element.name
|
| 262 |
-
# Text in list item can be hidden within hierarchy, hence
|
| 263 |
-
# we need to extract it recursively
|
| 264 |
-
text = self.extract_text_recursively(element)
|
| 265 |
-
# Flatten text, remove break lines:
|
| 266 |
-
text = text.replace("\n", "").replace("\r", "")
|
| 267 |
-
text = " ".join(text.split()).strip()
|
| 268 |
-
|
| 269 |
-
marker = ""
|
| 270 |
-
enumerated = False
|
| 271 |
-
if parent_list_label == GroupLabel.ORDERED_LIST:
|
| 272 |
-
marker = str(index_in_list)
|
| 273 |
-
enumerated = True
|
| 274 |
-
|
| 275 |
-
if len(text) > 0:
|
| 276 |
-
# create a list-item
|
| 277 |
-
self.parents[self.level + 1] = doc.add_list_item(
|
| 278 |
-
text=text,
|
| 279 |
-
enumerated=enumerated,
|
| 280 |
-
marker=marker,
|
| 281 |
-
parent=self.parents[self.level],
|
| 282 |
-
)
|
| 283 |
-
self.level += 1
|
| 284 |
-
|
| 285 |
-
self.walk(element, doc)
|
| 286 |
-
|
| 287 |
-
self.parents[self.level + 1] = None
|
| 288 |
-
self.level -= 1
|
| 289 |
-
|
| 290 |
-
elif isinstance(element.text, str):
|
| 291 |
-
text = element.text.strip()
|
| 292 |
-
|
| 293 |
-
marker = ""
|
| 294 |
-
enumerated = False
|
| 295 |
-
if parent_list_label == GroupLabel.ORDERED_LIST:
|
| 296 |
-
marker = f"{str(index_in_list)}."
|
| 297 |
-
enumerated = True
|
| 298 |
-
doc.add_list_item(
|
| 299 |
-
text=text,
|
| 300 |
-
enumerated=enumerated,
|
| 301 |
-
marker=marker,
|
| 302 |
-
parent=self.parents[self.level],
|
| 303 |
-
)
|
| 304 |
-
else:
|
| 305 |
-
_log.warn("list-item has no text: ", element)
|
| 306 |
-
|
| 307 |
-
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 308 |
-
"""Handles table tags."""
|
| 309 |
-
|
| 310 |
-
nested_tables = element.find("table")
|
| 311 |
-
if nested_tables is not None:
|
| 312 |
-
_log.warn("detected nested tables: skipping for now")
|
| 313 |
-
return
|
| 314 |
-
|
| 315 |
-
# Count the number of rows (number of <tr> elements)
|
| 316 |
-
num_rows = len(element.find_all("tr"))
|
| 317 |
-
|
| 318 |
-
# Find the number of columns (taking into account colspan)
|
| 319 |
-
num_cols = 0
|
| 320 |
-
for row in element.find_all("tr"):
|
| 321 |
-
col_count = 0
|
| 322 |
-
for cell in row.find_all(["td", "th"]):
|
| 323 |
-
colspan = int(cell.get("colspan", 1))
|
| 324 |
-
col_count += colspan
|
| 325 |
-
num_cols = max(num_cols, col_count)
|
| 326 |
-
|
| 327 |
-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
| 328 |
-
|
| 329 |
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
| 330 |
-
|
| 331 |
-
# Iterate over the rows in the table
|
| 332 |
-
for row_idx, row in enumerate(element.find_all("tr")):
|
| 333 |
-
|
| 334 |
-
# For each row, find all the column cells (both <td> and <th>)
|
| 335 |
-
cells = row.find_all(["td", "th"])
|
| 336 |
-
|
| 337 |
-
# Check if each cell in the row is a header -> means it is a column header
|
| 338 |
-
col_header = True
|
| 339 |
-
for j, html_cell in enumerate(cells):
|
| 340 |
-
if html_cell.name == "td":
|
| 341 |
-
col_header = False
|
| 342 |
-
|
| 343 |
-
col_idx = 0
|
| 344 |
-
# Extract and print the text content of each cell
|
| 345 |
-
for _, html_cell in enumerate(cells):
|
| 346 |
-
|
| 347 |
-
text = html_cell.text
|
| 348 |
-
try:
|
| 349 |
-
text = self.extract_table_cell_text(html_cell)
|
| 350 |
-
except Exception as exc:
|
| 351 |
-
_log.warn("exception: ", exc)
|
| 352 |
-
exit(-1)
|
| 353 |
-
|
| 354 |
-
# label = html_cell.name
|
| 355 |
-
|
| 356 |
-
col_span = int(html_cell.get("colspan", 1))
|
| 357 |
-
row_span = int(html_cell.get("rowspan", 1))
|
| 358 |
-
|
| 359 |
-
while grid[row_idx][col_idx] is not None:
|
| 360 |
-
col_idx += 1
|
| 361 |
-
for r in range(row_span):
|
| 362 |
-
for c in range(col_span):
|
| 363 |
-
grid[row_idx + r][col_idx + c] = text
|
| 364 |
-
|
| 365 |
-
cell = TableCell(
|
| 366 |
-
text=text,
|
| 367 |
-
row_span=row_span,
|
| 368 |
-
col_span=col_span,
|
| 369 |
-
start_row_offset_idx=row_idx,
|
| 370 |
-
end_row_offset_idx=row_idx + row_span,
|
| 371 |
-
start_col_offset_idx=col_idx,
|
| 372 |
-
end_col_offset_idx=col_idx + col_span,
|
| 373 |
-
col_header=col_header,
|
| 374 |
-
row_header=((not col_header) and html_cell.name == "th"),
|
| 375 |
-
)
|
| 376 |
-
data.table_cells.append(cell)
|
| 377 |
-
|
| 378 |
-
doc.add_table(data=data, parent=self.parents[self.level])
|
| 379 |
-
|
| 380 |
-
def get_list_text(self, list_element: Tag, level=0):
|
| 381 |
-
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
| 382 |
-
result = []
|
| 383 |
-
bullet_char = "*" # Default bullet character for unordered lists
|
| 384 |
-
|
| 385 |
-
if list_element.name == "ol": # For ordered lists, use numbers
|
| 386 |
-
for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
|
| 387 |
-
# Add numbering for ordered lists
|
| 388 |
-
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
|
| 389 |
-
# Handle nested lists
|
| 390 |
-
nested_list = li.find(["ul", "ol"])
|
| 391 |
-
if nested_list:
|
| 392 |
-
result.extend(self.get_list_text(nested_list, level + 1))
|
| 393 |
-
elif list_element.name == "ul": # For unordered lists, use bullet points
|
| 394 |
-
for li in list_element.find_all("li", recursive=False):
|
| 395 |
-
# Add bullet points for unordered lists
|
| 396 |
-
result.append(
|
| 397 |
-
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
|
| 398 |
-
)
|
| 399 |
-
# Handle nested lists
|
| 400 |
-
nested_list = li.find(["ul", "ol"])
|
| 401 |
-
if nested_list:
|
| 402 |
-
result.extend(self.get_list_text(nested_list, level + 1))
|
| 403 |
-
|
| 404 |
-
return result
|
| 405 |
-
|
| 406 |
-
def extract_table_cell_text(self, cell: Tag):
|
| 407 |
-
"""Extract text from a table cell, including lists with indents."""
|
| 408 |
-
contains_lists = cell.find(["ul", "ol"])
|
| 409 |
-
if contains_lists is None:
|
| 410 |
-
return cell.text
|
| 411 |
-
else:
|
| 412 |
-
_log.debug(
|
| 413 |
-
"should extract the content correctly for table-cells with lists ..."
|
| 414 |
-
)
|
| 415 |
-
return cell.text
|
| 416 |
-
|
| 417 |
-
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
| 418 |
-
"""Handles image tags (img)."""
|
| 419 |
-
|
| 420 |
-
# Extract the image URI from the <img> tag
|
| 421 |
-
# image_uri = root.xpath('//figure//img/@src')[0]
|
| 422 |
-
|
| 423 |
-
contains_captions = element.find(["figcaption"])
|
| 424 |
-
if contains_captions is None:
|
| 425 |
-
doc.add_picture(parent=self.parents[self.level], caption=None)
|
| 426 |
-
|
| 427 |
-
else:
|
| 428 |
-
texts = []
|
| 429 |
-
for item in contains_captions:
|
| 430 |
-
texts.append(item.text)
|
| 431 |
-
|
| 432 |
-
fig_caption = doc.add_text(
|
| 433 |
-
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
|
| 434 |
-
)
|
| 435 |
-
doc.add_picture(
|
| 436 |
-
parent=self.parents[self.level],
|
| 437 |
-
caption=fig_caption,
|
| 438 |
-
)
|
| 439 |
-
|
| 440 |
-
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
| 441 |
-
"""Handles image tags (img)."""
|
| 442 |
-
doc.add_picture(parent=self.parents[self.level], caption=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
from io import BytesIO
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import Union
|
| 4 |
-
|
| 5 |
-
from docling_core.types.doc import DoclingDocument
|
| 6 |
-
from typing_extensions import override
|
| 7 |
-
|
| 8 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 9 |
-
from docling.datamodel.base_models import InputFormat
|
| 10 |
-
from docling.datamodel.document import InputDocument
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class DoclingJSONBackend(DeclarativeDocumentBackend):
|
| 14 |
-
@override
|
| 15 |
-
def __init__(
|
| 16 |
-
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
| 17 |
-
) -> None:
|
| 18 |
-
super().__init__(in_doc, path_or_stream)
|
| 19 |
-
|
| 20 |
-
# given we need to store any actual conversion exception for raising it from
|
| 21 |
-
# convert(), this captures the successful result or the actual error in a
|
| 22 |
-
# mutually exclusive way:
|
| 23 |
-
self._doc_or_err = self._get_doc_or_err()
|
| 24 |
-
|
| 25 |
-
@override
|
| 26 |
-
def is_valid(self) -> bool:
|
| 27 |
-
return isinstance(self._doc_or_err, DoclingDocument)
|
| 28 |
-
|
| 29 |
-
@classmethod
|
| 30 |
-
@override
|
| 31 |
-
def supports_pagination(cls) -> bool:
|
| 32 |
-
return False
|
| 33 |
-
|
| 34 |
-
@classmethod
|
| 35 |
-
@override
|
| 36 |
-
def supported_formats(cls) -> set[InputFormat]:
|
| 37 |
-
return {InputFormat.JSON_DOCLING}
|
| 38 |
-
|
| 39 |
-
def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
|
| 40 |
-
try:
|
| 41 |
-
json_data: Union[str, bytes]
|
| 42 |
-
if isinstance(self.path_or_stream, Path):
|
| 43 |
-
with open(self.path_or_stream, encoding="utf-8") as f:
|
| 44 |
-
json_data = f.read()
|
| 45 |
-
elif isinstance(self.path_or_stream, BytesIO):
|
| 46 |
-
json_data = self.path_or_stream.getvalue()
|
| 47 |
-
else:
|
| 48 |
-
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
|
| 49 |
-
return DoclingDocument.model_validate_json(json_data=json_data)
|
| 50 |
-
except Exception as e:
|
| 51 |
-
return e
|
| 52 |
-
|
| 53 |
-
@override
|
| 54 |
-
def convert(self) -> DoclingDocument:
|
| 55 |
-
if isinstance(self._doc_or_err, DoclingDocument):
|
| 56 |
-
return self._doc_or_err
|
| 57 |
-
else:
|
| 58 |
-
raise self._doc_or_err
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py
DELETED
|
@@ -1,428 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import re
|
| 3 |
-
import warnings
|
| 4 |
-
from io import BytesIO
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import List, Optional, Set, Union
|
| 7 |
-
|
| 8 |
-
import marko
|
| 9 |
-
import marko.element
|
| 10 |
-
import marko.ext
|
| 11 |
-
import marko.ext.gfm
|
| 12 |
-
import marko.inline
|
| 13 |
-
from docling_core.types.doc import (
|
| 14 |
-
DocItem,
|
| 15 |
-
DocItemLabel,
|
| 16 |
-
DoclingDocument,
|
| 17 |
-
DocumentOrigin,
|
| 18 |
-
GroupLabel,
|
| 19 |
-
NodeItem,
|
| 20 |
-
TableCell,
|
| 21 |
-
TableData,
|
| 22 |
-
TextItem,
|
| 23 |
-
)
|
| 24 |
-
from marko import Markdown
|
| 25 |
-
|
| 26 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 27 |
-
from docling.backend.html_backend import HTMLDocumentBackend
|
| 28 |
-
from docling.datamodel.base_models import InputFormat
|
| 29 |
-
from docling.datamodel.document import InputDocument
|
| 30 |
-
|
| 31 |
-
_log = logging.getLogger(__name__)
|
| 32 |
-
|
| 33 |
-
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
| 34 |
-
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
| 35 |
-
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
| 39 |
-
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
| 40 |
-
# This regex will match any sequence of underscores
|
| 41 |
-
pattern = r"_+"
|
| 42 |
-
|
| 43 |
-
def replace_match(match):
|
| 44 |
-
underscore_sequence = match.group(
|
| 45 |
-
0
|
| 46 |
-
) # Get the full match (sequence of underscores)
|
| 47 |
-
|
| 48 |
-
# Shorten the sequence if it exceeds max_length
|
| 49 |
-
if len(underscore_sequence) > max_length:
|
| 50 |
-
return "_" * max_length
|
| 51 |
-
else:
|
| 52 |
-
return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
|
| 53 |
-
|
| 54 |
-
# Use re.sub to replace long underscore sequences
|
| 55 |
-
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
| 56 |
-
|
| 57 |
-
if len(shortened_text) != len(markdown_text):
|
| 58 |
-
warnings.warn("Detected potentially incorrect Markdown, correcting...")
|
| 59 |
-
|
| 60 |
-
return shortened_text
|
| 61 |
-
|
| 62 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 63 |
-
super().__init__(in_doc, path_or_stream)
|
| 64 |
-
|
| 65 |
-
_log.debug("MD INIT!!!")
|
| 66 |
-
|
| 67 |
-
# Markdown file:
|
| 68 |
-
self.path_or_stream = path_or_stream
|
| 69 |
-
self.valid = True
|
| 70 |
-
self.markdown = "" # To store original Markdown string
|
| 71 |
-
|
| 72 |
-
self.in_table = False
|
| 73 |
-
self.md_table_buffer: list[str] = []
|
| 74 |
-
self.inline_texts: list[str] = []
|
| 75 |
-
self._html_blocks: int = 0
|
| 76 |
-
|
| 77 |
-
try:
|
| 78 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 79 |
-
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
| 80 |
-
# remove invalid sequences
|
| 81 |
-
# very long sequences of underscores will lead to unnecessary long processing times.
|
| 82 |
-
# In any proper Markdown files, underscores have to be escaped,
|
| 83 |
-
# otherwise they represent emphasis (bold or italic)
|
| 84 |
-
self.markdown = self._shorten_underscore_sequences(text_stream)
|
| 85 |
-
if isinstance(self.path_or_stream, Path):
|
| 86 |
-
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
| 87 |
-
md_content = f.read()
|
| 88 |
-
# remove invalid sequences
|
| 89 |
-
# very long sequences of underscores will lead to unnecessary long processing times.
|
| 90 |
-
# In any proper Markdown files, underscores have to be escaped,
|
| 91 |
-
# otherwise they represent emphasis (bold or italic)
|
| 92 |
-
self.markdown = self._shorten_underscore_sequences(md_content)
|
| 93 |
-
self.valid = True
|
| 94 |
-
|
| 95 |
-
_log.debug(self.markdown)
|
| 96 |
-
except Exception as e:
|
| 97 |
-
raise RuntimeError(
|
| 98 |
-
f"Could not initialize MD backend for file with hash {self.document_hash}."
|
| 99 |
-
) from e
|
| 100 |
-
return
|
| 101 |
-
|
| 102 |
-
def _close_table(self, doc: DoclingDocument):
|
| 103 |
-
if self.in_table:
|
| 104 |
-
_log.debug("=== TABLE START ===")
|
| 105 |
-
for md_table_row in self.md_table_buffer:
|
| 106 |
-
_log.debug(md_table_row)
|
| 107 |
-
_log.debug("=== TABLE END ===")
|
| 108 |
-
tcells: List[TableCell] = []
|
| 109 |
-
result_table = []
|
| 110 |
-
for n, md_table_row in enumerate(self.md_table_buffer):
|
| 111 |
-
data = []
|
| 112 |
-
if n == 0:
|
| 113 |
-
header = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
| 114 |
-
for value in header:
|
| 115 |
-
data.append(value)
|
| 116 |
-
result_table.append(data)
|
| 117 |
-
if n > 1:
|
| 118 |
-
values = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
| 119 |
-
for value in values:
|
| 120 |
-
data.append(value)
|
| 121 |
-
result_table.append(data)
|
| 122 |
-
|
| 123 |
-
for trow_ind, trow in enumerate(result_table):
|
| 124 |
-
for tcol_ind, cellval in enumerate(trow):
|
| 125 |
-
row_span = (
|
| 126 |
-
1 # currently supporting just simple tables (without spans)
|
| 127 |
-
)
|
| 128 |
-
col_span = (
|
| 129 |
-
1 # currently supporting just simple tables (without spans)
|
| 130 |
-
)
|
| 131 |
-
icell = TableCell(
|
| 132 |
-
text=cellval.strip(),
|
| 133 |
-
row_span=row_span,
|
| 134 |
-
col_span=col_span,
|
| 135 |
-
start_row_offset_idx=trow_ind,
|
| 136 |
-
end_row_offset_idx=trow_ind + row_span,
|
| 137 |
-
start_col_offset_idx=tcol_ind,
|
| 138 |
-
end_col_offset_idx=tcol_ind + col_span,
|
| 139 |
-
col_header=False,
|
| 140 |
-
row_header=False,
|
| 141 |
-
)
|
| 142 |
-
tcells.append(icell)
|
| 143 |
-
|
| 144 |
-
num_rows = len(result_table)
|
| 145 |
-
num_cols = len(result_table[0])
|
| 146 |
-
self.in_table = False
|
| 147 |
-
self.md_table_buffer = [] # clean table markdown buffer
|
| 148 |
-
# Initialize Docling TableData
|
| 149 |
-
table_data = TableData(
|
| 150 |
-
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
|
| 151 |
-
)
|
| 152 |
-
# Populate
|
| 153 |
-
for tcell in tcells:
|
| 154 |
-
table_data.table_cells.append(tcell)
|
| 155 |
-
if len(tcells) > 0:
|
| 156 |
-
doc.add_table(data=table_data)
|
| 157 |
-
return
|
| 158 |
-
|
| 159 |
-
def _process_inline_text(
|
| 160 |
-
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
| 161 |
-
):
|
| 162 |
-
txt = " ".join(self.inline_texts)
|
| 163 |
-
if len(txt) > 0:
|
| 164 |
-
doc.add_text(
|
| 165 |
-
label=DocItemLabel.PARAGRAPH,
|
| 166 |
-
parent=parent_item,
|
| 167 |
-
text=txt,
|
| 168 |
-
)
|
| 169 |
-
self.inline_texts = []
|
| 170 |
-
|
| 171 |
-
def _iterate_elements(
|
| 172 |
-
self,
|
| 173 |
-
element: marko.element.Element,
|
| 174 |
-
depth: int,
|
| 175 |
-
doc: DoclingDocument,
|
| 176 |
-
visited: Set[marko.element.Element],
|
| 177 |
-
parent_item: Optional[NodeItem] = None,
|
| 178 |
-
):
|
| 179 |
-
|
| 180 |
-
if element in visited:
|
| 181 |
-
return
|
| 182 |
-
|
| 183 |
-
# Iterates over all elements in the AST
|
| 184 |
-
# Check for different element types and process relevant details
|
| 185 |
-
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
| 186 |
-
self._close_table(doc)
|
| 187 |
-
self._process_inline_text(parent_item, doc)
|
| 188 |
-
_log.debug(
|
| 189 |
-
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
| 190 |
-
)
|
| 191 |
-
if element.level == 1:
|
| 192 |
-
doc_label = DocItemLabel.TITLE
|
| 193 |
-
else:
|
| 194 |
-
doc_label = DocItemLabel.SECTION_HEADER
|
| 195 |
-
|
| 196 |
-
# Header could have arbitrary inclusion of bold, italic or emphasis,
|
| 197 |
-
# hence we need to traverse the tree to get full text of a header
|
| 198 |
-
strings: List[str] = []
|
| 199 |
-
|
| 200 |
-
# Define a recursive function to traverse the tree
|
| 201 |
-
def traverse(node: marko.block.BlockElement):
|
| 202 |
-
# Check if the node has a "children" attribute
|
| 203 |
-
if hasattr(node, "children"):
|
| 204 |
-
# If "children" is a list, continue traversal
|
| 205 |
-
if isinstance(node.children, list):
|
| 206 |
-
for child in node.children:
|
| 207 |
-
traverse(child)
|
| 208 |
-
# If "children" is text, add it to header text
|
| 209 |
-
elif isinstance(node.children, str):
|
| 210 |
-
strings.append(node.children)
|
| 211 |
-
|
| 212 |
-
traverse(element)
|
| 213 |
-
snippet_text = "".join(strings)
|
| 214 |
-
if len(snippet_text) > 0:
|
| 215 |
-
parent_item = doc.add_text(
|
| 216 |
-
label=doc_label, parent=parent_item, text=snippet_text
|
| 217 |
-
)
|
| 218 |
-
|
| 219 |
-
elif isinstance(element, marko.block.List):
|
| 220 |
-
has_non_empty_list_items = False
|
| 221 |
-
for child in element.children:
|
| 222 |
-
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
|
| 223 |
-
has_non_empty_list_items = True
|
| 224 |
-
break
|
| 225 |
-
|
| 226 |
-
self._close_table(doc)
|
| 227 |
-
self._process_inline_text(parent_item, doc)
|
| 228 |
-
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
| 229 |
-
if has_non_empty_list_items:
|
| 230 |
-
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
| 231 |
-
parent_item = doc.add_group(
|
| 232 |
-
label=label, name=f"list", parent=parent_item
|
| 233 |
-
)
|
| 234 |
-
|
| 235 |
-
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
| 236 |
-
self._close_table(doc)
|
| 237 |
-
self._process_inline_text(parent_item, doc)
|
| 238 |
-
_log.debug(" - List item")
|
| 239 |
-
|
| 240 |
-
first_child = element.children[0]
|
| 241 |
-
snippet_text = str(first_child.children[0].children) # type: ignore
|
| 242 |
-
is_numbered = False
|
| 243 |
-
if (
|
| 244 |
-
parent_item is not None
|
| 245 |
-
and isinstance(parent_item, DocItem)
|
| 246 |
-
and parent_item.label == GroupLabel.ORDERED_LIST
|
| 247 |
-
):
|
| 248 |
-
is_numbered = True
|
| 249 |
-
doc.add_list_item(
|
| 250 |
-
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
| 251 |
-
)
|
| 252 |
-
visited.add(first_child)
|
| 253 |
-
|
| 254 |
-
elif isinstance(element, marko.inline.Image):
|
| 255 |
-
self._close_table(doc)
|
| 256 |
-
self._process_inline_text(parent_item, doc)
|
| 257 |
-
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
| 258 |
-
|
| 259 |
-
fig_caption: Optional[TextItem] = None
|
| 260 |
-
if element.title is not None and element.title != "":
|
| 261 |
-
fig_caption = doc.add_text(
|
| 262 |
-
label=DocItemLabel.CAPTION, text=element.title
|
| 263 |
-
)
|
| 264 |
-
|
| 265 |
-
doc.add_picture(parent=parent_item, caption=fig_caption)
|
| 266 |
-
|
| 267 |
-
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
| 268 |
-
self._process_inline_text(parent_item, doc)
|
| 269 |
-
|
| 270 |
-
elif isinstance(element, marko.inline.RawText):
|
| 271 |
-
_log.debug(f" - Paragraph (raw text): {element.children}")
|
| 272 |
-
snippet_text = element.children.strip()
|
| 273 |
-
# Detect start of the table:
|
| 274 |
-
if "|" in snippet_text:
|
| 275 |
-
# most likely part of the markdown table
|
| 276 |
-
self.in_table = True
|
| 277 |
-
if len(self.md_table_buffer) > 0:
|
| 278 |
-
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
| 279 |
-
else:
|
| 280 |
-
self.md_table_buffer.append(snippet_text)
|
| 281 |
-
else:
|
| 282 |
-
self._close_table(doc)
|
| 283 |
-
# most likely just inline text
|
| 284 |
-
self.inline_texts.append(str(element.children))
|
| 285 |
-
|
| 286 |
-
elif isinstance(element, marko.inline.CodeSpan):
|
| 287 |
-
self._close_table(doc)
|
| 288 |
-
self._process_inline_text(parent_item, doc)
|
| 289 |
-
_log.debug(f" - Code Span: {element.children}")
|
| 290 |
-
snippet_text = str(element.children).strip()
|
| 291 |
-
doc.add_code(parent=parent_item, text=snippet_text)
|
| 292 |
-
|
| 293 |
-
elif (
|
| 294 |
-
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
| 295 |
-
and len(element.children) > 0
|
| 296 |
-
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
| 297 |
-
and len(snippet_text := (first_child.children.strip())) > 0
|
| 298 |
-
):
|
| 299 |
-
self._close_table(doc)
|
| 300 |
-
self._process_inline_text(parent_item, doc)
|
| 301 |
-
_log.debug(f" - Code Block: {element.children}")
|
| 302 |
-
doc.add_code(parent=parent_item, text=snippet_text)
|
| 303 |
-
|
| 304 |
-
elif isinstance(element, marko.inline.LineBreak):
|
| 305 |
-
if self.in_table:
|
| 306 |
-
_log.debug("Line break in a table")
|
| 307 |
-
self.md_table_buffer.append("")
|
| 308 |
-
|
| 309 |
-
elif isinstance(element, marko.block.HTMLBlock):
|
| 310 |
-
self._html_blocks += 1
|
| 311 |
-
self._process_inline_text(parent_item, doc)
|
| 312 |
-
self._close_table(doc)
|
| 313 |
-
_log.debug("HTML Block: {}".format(element))
|
| 314 |
-
if (
|
| 315 |
-
len(element.body) > 0
|
| 316 |
-
): # If Marko doesn't return any content for HTML block, skip it
|
| 317 |
-
html_block = element.body.strip()
|
| 318 |
-
|
| 319 |
-
# wrap in markers to enable post-processing in convert()
|
| 320 |
-
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
| 321 |
-
doc.add_code(parent=parent_item, text=text_to_add)
|
| 322 |
-
else:
|
| 323 |
-
if not isinstance(element, str):
|
| 324 |
-
self._close_table(doc)
|
| 325 |
-
_log.debug("Some other element: {}".format(element))
|
| 326 |
-
|
| 327 |
-
processed_block_types = (
|
| 328 |
-
marko.block.Heading,
|
| 329 |
-
marko.block.CodeBlock,
|
| 330 |
-
marko.block.FencedCode,
|
| 331 |
-
marko.inline.RawText,
|
| 332 |
-
)
|
| 333 |
-
|
| 334 |
-
# Iterate through the element's children (if any)
|
| 335 |
-
if hasattr(element, "children") and not isinstance(
|
| 336 |
-
element, processed_block_types
|
| 337 |
-
):
|
| 338 |
-
for child in element.children:
|
| 339 |
-
self._iterate_elements(
|
| 340 |
-
element=child,
|
| 341 |
-
depth=depth + 1,
|
| 342 |
-
doc=doc,
|
| 343 |
-
visited=visited,
|
| 344 |
-
parent_item=parent_item,
|
| 345 |
-
)
|
| 346 |
-
|
| 347 |
-
def is_valid(self) -> bool:
|
| 348 |
-
return self.valid
|
| 349 |
-
|
| 350 |
-
def unload(self):
|
| 351 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 352 |
-
self.path_or_stream.close()
|
| 353 |
-
self.path_or_stream = None
|
| 354 |
-
|
| 355 |
-
@classmethod
|
| 356 |
-
def supports_pagination(cls) -> bool:
|
| 357 |
-
return False
|
| 358 |
-
|
| 359 |
-
@classmethod
|
| 360 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 361 |
-
return {InputFormat.MD}
|
| 362 |
-
|
| 363 |
-
def convert(self) -> DoclingDocument:
|
| 364 |
-
_log.debug("converting Markdown...")
|
| 365 |
-
|
| 366 |
-
origin = DocumentOrigin(
|
| 367 |
-
filename=self.file.name or "file",
|
| 368 |
-
mimetype="text/markdown",
|
| 369 |
-
binary_hash=self.document_hash,
|
| 370 |
-
)
|
| 371 |
-
|
| 372 |
-
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
| 373 |
-
|
| 374 |
-
if self.is_valid():
|
| 375 |
-
# Parse the markdown into an abstract syntax tree (AST)
|
| 376 |
-
marko_parser = Markdown()
|
| 377 |
-
parsed_ast = marko_parser.parse(self.markdown)
|
| 378 |
-
# Start iterating from the root of the AST
|
| 379 |
-
self._iterate_elements(
|
| 380 |
-
element=parsed_ast,
|
| 381 |
-
depth=0,
|
| 382 |
-
doc=doc,
|
| 383 |
-
parent_item=None,
|
| 384 |
-
visited=set(),
|
| 385 |
-
)
|
| 386 |
-
self._process_inline_text(None, doc) # handle last hanging inline text
|
| 387 |
-
self._close_table(doc=doc) # handle any last hanging table
|
| 388 |
-
|
| 389 |
-
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
| 390 |
-
if self._html_blocks > 0:
|
| 391 |
-
|
| 392 |
-
# export to HTML
|
| 393 |
-
html_backend_cls = HTMLDocumentBackend
|
| 394 |
-
html_str = doc.export_to_html()
|
| 395 |
-
|
| 396 |
-
def _restore_original_html(txt, regex):
|
| 397 |
-
_txt, count = re.subn(regex, "", txt)
|
| 398 |
-
if count != self._html_blocks:
|
| 399 |
-
raise RuntimeError(
|
| 400 |
-
"An internal error has occurred during Markdown conversion."
|
| 401 |
-
)
|
| 402 |
-
return _txt
|
| 403 |
-
|
| 404 |
-
# restore original HTML by removing previouly added markers
|
| 405 |
-
for regex in [
|
| 406 |
-
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
| 407 |
-
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
| 408 |
-
]:
|
| 409 |
-
html_str = _restore_original_html(txt=html_str, regex=regex)
|
| 410 |
-
self._html_blocks = 0
|
| 411 |
-
|
| 412 |
-
# delegate to HTML backend
|
| 413 |
-
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
| 414 |
-
in_doc = InputDocument(
|
| 415 |
-
path_or_stream=stream,
|
| 416 |
-
format=InputFormat.HTML,
|
| 417 |
-
backend=html_backend_cls,
|
| 418 |
-
filename=self.file.name,
|
| 419 |
-
)
|
| 420 |
-
html_backend_obj = html_backend_cls(
|
| 421 |
-
in_doc=in_doc, path_or_stream=stream
|
| 422 |
-
)
|
| 423 |
-
doc = html_backend_obj.convert()
|
| 424 |
-
else:
|
| 425 |
-
raise RuntimeError(
|
| 426 |
-
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
| 427 |
-
)
|
| 428 |
-
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py
DELETED
|
@@ -1,386 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Dict, Set, Tuple, Union
|
| 5 |
-
|
| 6 |
-
from docling_core.types.doc import (
|
| 7 |
-
DoclingDocument,
|
| 8 |
-
DocumentOrigin,
|
| 9 |
-
GroupLabel,
|
| 10 |
-
ImageRef,
|
| 11 |
-
TableCell,
|
| 12 |
-
TableData,
|
| 13 |
-
)
|
| 14 |
-
|
| 15 |
-
# from lxml import etree
|
| 16 |
-
from openpyxl import Workbook, load_workbook
|
| 17 |
-
from openpyxl.cell.cell import Cell
|
| 18 |
-
from openpyxl.drawing.image import Image
|
| 19 |
-
from openpyxl.worksheet.worksheet import Worksheet
|
| 20 |
-
|
| 21 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 22 |
-
from docling.datamodel.base_models import InputFormat
|
| 23 |
-
from docling.datamodel.document import InputDocument
|
| 24 |
-
|
| 25 |
-
_log = logging.getLogger(__name__)
|
| 26 |
-
|
| 27 |
-
from typing import Any, List
|
| 28 |
-
|
| 29 |
-
from PIL import Image as PILImage
|
| 30 |
-
from pydantic import BaseModel
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class ExcelCell(BaseModel):
|
| 34 |
-
row: int
|
| 35 |
-
col: int
|
| 36 |
-
text: str
|
| 37 |
-
row_span: int
|
| 38 |
-
col_span: int
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class ExcelTable(BaseModel):
|
| 42 |
-
num_rows: int
|
| 43 |
-
num_cols: int
|
| 44 |
-
data: List[ExcelCell]
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
| 48 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 49 |
-
super().__init__(in_doc, path_or_stream)
|
| 50 |
-
|
| 51 |
-
# Initialise the parents for the hierarchy
|
| 52 |
-
self.max_levels = 10
|
| 53 |
-
|
| 54 |
-
self.parents: Dict[int, Any] = {}
|
| 55 |
-
for i in range(-1, self.max_levels):
|
| 56 |
-
self.parents[i] = None
|
| 57 |
-
|
| 58 |
-
self.workbook = None
|
| 59 |
-
try:
|
| 60 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 61 |
-
self.workbook = load_workbook(filename=self.path_or_stream)
|
| 62 |
-
|
| 63 |
-
elif isinstance(self.path_or_stream, Path):
|
| 64 |
-
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
| 65 |
-
|
| 66 |
-
self.valid = True
|
| 67 |
-
except Exception as e:
|
| 68 |
-
self.valid = False
|
| 69 |
-
|
| 70 |
-
raise RuntimeError(
|
| 71 |
-
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
| 72 |
-
) from e
|
| 73 |
-
|
| 74 |
-
def is_valid(self) -> bool:
|
| 75 |
-
_log.info(f"valid: {self.valid}")
|
| 76 |
-
return self.valid
|
| 77 |
-
|
| 78 |
-
@classmethod
|
| 79 |
-
def supports_pagination(cls) -> bool:
|
| 80 |
-
return True
|
| 81 |
-
|
| 82 |
-
def unload(self):
|
| 83 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 84 |
-
self.path_or_stream.close()
|
| 85 |
-
|
| 86 |
-
self.path_or_stream = None
|
| 87 |
-
|
| 88 |
-
@classmethod
|
| 89 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 90 |
-
return {InputFormat.XLSX}
|
| 91 |
-
|
| 92 |
-
def convert(self) -> DoclingDocument:
|
| 93 |
-
# Parses the XLSX into a structured document model.
|
| 94 |
-
|
| 95 |
-
origin = DocumentOrigin(
|
| 96 |
-
filename=self.file.name or "file.xlsx",
|
| 97 |
-
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 98 |
-
binary_hash=self.document_hash,
|
| 99 |
-
)
|
| 100 |
-
|
| 101 |
-
doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
|
| 102 |
-
|
| 103 |
-
if self.is_valid():
|
| 104 |
-
doc = self._convert_workbook(doc)
|
| 105 |
-
else:
|
| 106 |
-
raise RuntimeError(
|
| 107 |
-
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
return doc
|
| 111 |
-
|
| 112 |
-
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
| 113 |
-
|
| 114 |
-
if self.workbook is not None:
|
| 115 |
-
|
| 116 |
-
# Iterate over all sheets
|
| 117 |
-
for sheet_name in self.workbook.sheetnames:
|
| 118 |
-
_log.info(f"Processing sheet: {sheet_name}")
|
| 119 |
-
|
| 120 |
-
# Access the sheet by name
|
| 121 |
-
sheet = self.workbook[sheet_name]
|
| 122 |
-
|
| 123 |
-
self.parents[0] = doc.add_group(
|
| 124 |
-
parent=None,
|
| 125 |
-
label=GroupLabel.SECTION,
|
| 126 |
-
name=f"sheet: {sheet_name}",
|
| 127 |
-
)
|
| 128 |
-
|
| 129 |
-
doc = self._convert_sheet(doc, sheet)
|
| 130 |
-
else:
|
| 131 |
-
_log.error("Workbook is not initialized.")
|
| 132 |
-
|
| 133 |
-
return doc
|
| 134 |
-
|
| 135 |
-
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
| 136 |
-
|
| 137 |
-
doc = self._find_tables_in_sheet(doc, sheet)
|
| 138 |
-
|
| 139 |
-
doc = self._find_images_in_sheet(doc, sheet)
|
| 140 |
-
|
| 141 |
-
return doc
|
| 142 |
-
|
| 143 |
-
def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
| 144 |
-
|
| 145 |
-
tables = self._find_data_tables(sheet)
|
| 146 |
-
|
| 147 |
-
for excel_table in tables:
|
| 148 |
-
num_rows = excel_table.num_rows
|
| 149 |
-
num_cols = excel_table.num_cols
|
| 150 |
-
|
| 151 |
-
table_data = TableData(
|
| 152 |
-
num_rows=num_rows,
|
| 153 |
-
num_cols=num_cols,
|
| 154 |
-
table_cells=[],
|
| 155 |
-
)
|
| 156 |
-
|
| 157 |
-
for excel_cell in excel_table.data:
|
| 158 |
-
|
| 159 |
-
cell = TableCell(
|
| 160 |
-
text=excel_cell.text,
|
| 161 |
-
row_span=excel_cell.row_span,
|
| 162 |
-
col_span=excel_cell.col_span,
|
| 163 |
-
start_row_offset_idx=excel_cell.row,
|
| 164 |
-
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
| 165 |
-
start_col_offset_idx=excel_cell.col,
|
| 166 |
-
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
| 167 |
-
col_header=False,
|
| 168 |
-
row_header=False,
|
| 169 |
-
)
|
| 170 |
-
table_data.table_cells.append(cell)
|
| 171 |
-
|
| 172 |
-
doc.add_table(data=table_data, parent=self.parents[0])
|
| 173 |
-
|
| 174 |
-
return doc
|
| 175 |
-
|
| 176 |
-
def _find_data_tables(self, sheet: Worksheet):
|
| 177 |
-
"""
|
| 178 |
-
Find all compact rectangular data tables in a sheet.
|
| 179 |
-
"""
|
| 180 |
-
# _log.info("find_data_tables")
|
| 181 |
-
|
| 182 |
-
tables = [] # List to store found tables
|
| 183 |
-
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
| 184 |
-
|
| 185 |
-
# Iterate over all cells in the sheet
|
| 186 |
-
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
| 187 |
-
for rj, cell in enumerate(row):
|
| 188 |
-
|
| 189 |
-
# Skip empty or already visited cells
|
| 190 |
-
if cell.value is None or (ri, rj) in visited:
|
| 191 |
-
continue
|
| 192 |
-
|
| 193 |
-
# If the cell starts a new table, find its bounds
|
| 194 |
-
table_bounds, visited_cells = self._find_table_bounds(
|
| 195 |
-
sheet, ri, rj, visited
|
| 196 |
-
)
|
| 197 |
-
|
| 198 |
-
visited.update(visited_cells) # Mark these cells as visited
|
| 199 |
-
tables.append(table_bounds)
|
| 200 |
-
|
| 201 |
-
return tables
|
| 202 |
-
|
| 203 |
-
def _find_table_bounds(
|
| 204 |
-
self,
|
| 205 |
-
sheet: Worksheet,
|
| 206 |
-
start_row: int,
|
| 207 |
-
start_col: int,
|
| 208 |
-
visited: set[Tuple[int, int]],
|
| 209 |
-
):
|
| 210 |
-
"""
|
| 211 |
-
Determine the bounds of a compact rectangular table.
|
| 212 |
-
Returns:
|
| 213 |
-
- A dictionary with the bounds and data.
|
| 214 |
-
- A set of visited cell coordinates.
|
| 215 |
-
"""
|
| 216 |
-
_log.info("find_table_bounds")
|
| 217 |
-
|
| 218 |
-
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
| 219 |
-
max_col = self._find_table_right(sheet, start_row, start_col)
|
| 220 |
-
|
| 221 |
-
# Collect the data within the bounds
|
| 222 |
-
data = []
|
| 223 |
-
visited_cells = set()
|
| 224 |
-
for ri in range(start_row, max_row + 1):
|
| 225 |
-
for rj in range(start_col, max_col + 1):
|
| 226 |
-
|
| 227 |
-
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
| 228 |
-
|
| 229 |
-
# Check if the cell belongs to a merged range
|
| 230 |
-
row_span = 1
|
| 231 |
-
col_span = 1
|
| 232 |
-
|
| 233 |
-
# _log.info(sheet.merged_cells.ranges)
|
| 234 |
-
for merged_range in sheet.merged_cells.ranges:
|
| 235 |
-
|
| 236 |
-
if (
|
| 237 |
-
merged_range.min_row <= ri + 1
|
| 238 |
-
and ri + 1 <= merged_range.max_row
|
| 239 |
-
and merged_range.min_col <= rj + 1
|
| 240 |
-
and rj + 1 <= merged_range.max_col
|
| 241 |
-
):
|
| 242 |
-
|
| 243 |
-
row_span = merged_range.max_row - merged_range.min_row + 1
|
| 244 |
-
col_span = merged_range.max_col - merged_range.min_col + 1
|
| 245 |
-
break
|
| 246 |
-
|
| 247 |
-
if (ri, rj) not in visited_cells:
|
| 248 |
-
data.append(
|
| 249 |
-
ExcelCell(
|
| 250 |
-
row=ri - start_row,
|
| 251 |
-
col=rj - start_col,
|
| 252 |
-
text=str(cell.value),
|
| 253 |
-
row_span=row_span,
|
| 254 |
-
col_span=col_span,
|
| 255 |
-
)
|
| 256 |
-
)
|
| 257 |
-
# _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
|
| 258 |
-
|
| 259 |
-
# Mark all cells in the span as visited
|
| 260 |
-
for span_row in range(ri, ri + row_span):
|
| 261 |
-
for span_col in range(rj, rj + col_span):
|
| 262 |
-
visited_cells.add((span_row, span_col))
|
| 263 |
-
|
| 264 |
-
return (
|
| 265 |
-
ExcelTable(
|
| 266 |
-
num_rows=max_row + 1 - start_row,
|
| 267 |
-
num_cols=max_col + 1 - start_col,
|
| 268 |
-
data=data,
|
| 269 |
-
),
|
| 270 |
-
visited_cells,
|
| 271 |
-
)
|
| 272 |
-
|
| 273 |
-
def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
|
| 274 |
-
"""Function to find the bottom boundary of the table"""
|
| 275 |
-
|
| 276 |
-
max_row = start_row
|
| 277 |
-
|
| 278 |
-
while max_row < sheet.max_row - 1:
|
| 279 |
-
# Get the cell value or check if it is part of a merged cell
|
| 280 |
-
cell = sheet.cell(row=max_row + 2, column=start_col + 1)
|
| 281 |
-
|
| 282 |
-
# Check if the cell is part of a merged range
|
| 283 |
-
merged_range = next(
|
| 284 |
-
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
| 285 |
-
None,
|
| 286 |
-
)
|
| 287 |
-
|
| 288 |
-
if cell.value is None and not merged_range:
|
| 289 |
-
break # Stop if the cell is empty and not merged
|
| 290 |
-
|
| 291 |
-
# Expand max_row to include the merged range if applicable
|
| 292 |
-
if merged_range:
|
| 293 |
-
max_row = max(max_row, merged_range.max_row - 1)
|
| 294 |
-
else:
|
| 295 |
-
max_row += 1
|
| 296 |
-
|
| 297 |
-
return max_row
|
| 298 |
-
|
| 299 |
-
def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
|
| 300 |
-
"""Function to find the right boundary of the table"""
|
| 301 |
-
|
| 302 |
-
max_col = start_col
|
| 303 |
-
|
| 304 |
-
while max_col < sheet.max_column - 1:
|
| 305 |
-
# Get the cell value or check if it is part of a merged cell
|
| 306 |
-
cell = sheet.cell(row=start_row + 1, column=max_col + 2)
|
| 307 |
-
|
| 308 |
-
# Check if the cell is part of a merged range
|
| 309 |
-
merged_range = next(
|
| 310 |
-
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
| 311 |
-
None,
|
| 312 |
-
)
|
| 313 |
-
|
| 314 |
-
if cell.value is None and not merged_range:
|
| 315 |
-
break # Stop if the cell is empty and not merged
|
| 316 |
-
|
| 317 |
-
# Expand max_col to include the merged range if applicable
|
| 318 |
-
if merged_range:
|
| 319 |
-
max_col = max(max_col, merged_range.max_col - 1)
|
| 320 |
-
else:
|
| 321 |
-
max_col += 1
|
| 322 |
-
|
| 323 |
-
return max_col
|
| 324 |
-
|
| 325 |
-
def _find_images_in_sheet(
|
| 326 |
-
self, doc: DoclingDocument, sheet: Worksheet
|
| 327 |
-
) -> DoclingDocument:
|
| 328 |
-
|
| 329 |
-
# Iterate over byte images in the sheet
|
| 330 |
-
for idx, image in enumerate(sheet._images): # type: ignore
|
| 331 |
-
|
| 332 |
-
try:
|
| 333 |
-
pil_image = PILImage.open(image.ref)
|
| 334 |
-
|
| 335 |
-
doc.add_picture(
|
| 336 |
-
parent=self.parents[0],
|
| 337 |
-
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
| 338 |
-
caption=None,
|
| 339 |
-
)
|
| 340 |
-
except:
|
| 341 |
-
_log.error("could not extract the image from excel sheets")
|
| 342 |
-
|
| 343 |
-
"""
|
| 344 |
-
for idx, chart in enumerate(sheet._charts): # type: ignore
|
| 345 |
-
try:
|
| 346 |
-
chart_path = f"chart_{idx + 1}.png"
|
| 347 |
-
_log.info(
|
| 348 |
-
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
| 349 |
-
)
|
| 350 |
-
|
| 351 |
-
_log.info(f"Chart {idx + 1}:")
|
| 352 |
-
|
| 353 |
-
# Chart type
|
| 354 |
-
# _log.info(f"Type: {type(chart).__name__}")
|
| 355 |
-
print(f"Type: {type(chart).__name__}")
|
| 356 |
-
|
| 357 |
-
# Extract series data
|
| 358 |
-
for series_idx, series in enumerate(chart.series):
|
| 359 |
-
#_log.info(f"Series {series_idx + 1}:")
|
| 360 |
-
print(f"Series {series_idx + 1} type: {type(series).__name__}")
|
| 361 |
-
#print(f"x-values: {series.xVal}")
|
| 362 |
-
#print(f"y-values: {series.yVal}")
|
| 363 |
-
|
| 364 |
-
print(f"xval type: {type(series.xVal).__name__}")
|
| 365 |
-
|
| 366 |
-
xvals = []
|
| 367 |
-
for _ in series.xVal.numLit.pt:
|
| 368 |
-
print(f"xval type: {type(_).__name__}")
|
| 369 |
-
if hasattr(_, 'v'):
|
| 370 |
-
xvals.append(_.v)
|
| 371 |
-
|
| 372 |
-
print(f"x-values: {xvals}")
|
| 373 |
-
|
| 374 |
-
yvals = []
|
| 375 |
-
for _ in series.yVal:
|
| 376 |
-
if hasattr(_, 'v'):
|
| 377 |
-
yvals.append(_.v)
|
| 378 |
-
|
| 379 |
-
print(f"y-values: {yvals}")
|
| 380 |
-
|
| 381 |
-
except Exception as exc:
|
| 382 |
-
print(exc)
|
| 383 |
-
continue
|
| 384 |
-
"""
|
| 385 |
-
|
| 386 |
-
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py
DELETED
|
@@ -1,424 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Set, Union
|
| 5 |
-
|
| 6 |
-
from docling_core.types.doc import (
|
| 7 |
-
BoundingBox,
|
| 8 |
-
CoordOrigin,
|
| 9 |
-
DocItemLabel,
|
| 10 |
-
DoclingDocument,
|
| 11 |
-
DocumentOrigin,
|
| 12 |
-
GroupLabel,
|
| 13 |
-
ImageRef,
|
| 14 |
-
ProvenanceItem,
|
| 15 |
-
Size,
|
| 16 |
-
TableCell,
|
| 17 |
-
TableData,
|
| 18 |
-
)
|
| 19 |
-
from PIL import Image, UnidentifiedImageError
|
| 20 |
-
from pptx import Presentation
|
| 21 |
-
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
| 22 |
-
|
| 23 |
-
from docling.backend.abstract_backend import (
|
| 24 |
-
DeclarativeDocumentBackend,
|
| 25 |
-
PaginatedDocumentBackend,
|
| 26 |
-
)
|
| 27 |
-
from docling.datamodel.base_models import InputFormat
|
| 28 |
-
from docling.datamodel.document import InputDocument
|
| 29 |
-
|
| 30 |
-
_log = logging.getLogger(__name__)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
| 34 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 35 |
-
super().__init__(in_doc, path_or_stream)
|
| 36 |
-
self.namespaces = {
|
| 37 |
-
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
| 38 |
-
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
| 39 |
-
"p": "http://schemas.openxmlformats.org/presentationml/2006/main",
|
| 40 |
-
}
|
| 41 |
-
# Powerpoint file:
|
| 42 |
-
self.path_or_stream = path_or_stream
|
| 43 |
-
|
| 44 |
-
self.pptx_obj = None
|
| 45 |
-
self.valid = False
|
| 46 |
-
try:
|
| 47 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 48 |
-
self.pptx_obj = Presentation(self.path_or_stream)
|
| 49 |
-
elif isinstance(self.path_or_stream, Path):
|
| 50 |
-
self.pptx_obj = Presentation(str(self.path_or_stream))
|
| 51 |
-
|
| 52 |
-
self.valid = True
|
| 53 |
-
except Exception as e:
|
| 54 |
-
raise RuntimeError(
|
| 55 |
-
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
| 56 |
-
) from e
|
| 57 |
-
|
| 58 |
-
return
|
| 59 |
-
|
| 60 |
-
def page_count(self) -> int:
|
| 61 |
-
if self.is_valid():
|
| 62 |
-
assert self.pptx_obj is not None
|
| 63 |
-
return len(self.pptx_obj.slides)
|
| 64 |
-
else:
|
| 65 |
-
return 0
|
| 66 |
-
|
| 67 |
-
def is_valid(self) -> bool:
|
| 68 |
-
return self.valid
|
| 69 |
-
|
| 70 |
-
@classmethod
|
| 71 |
-
def supports_pagination(cls) -> bool:
|
| 72 |
-
return True # True? if so, how to handle pages...
|
| 73 |
-
|
| 74 |
-
def unload(self):
|
| 75 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 76 |
-
self.path_or_stream.close()
|
| 77 |
-
|
| 78 |
-
self.path_or_stream = None
|
| 79 |
-
|
| 80 |
-
@classmethod
|
| 81 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 82 |
-
return {InputFormat.PPTX}
|
| 83 |
-
|
| 84 |
-
def convert(self) -> DoclingDocument:
|
| 85 |
-
# Parses the PPTX into a structured document model.
|
| 86 |
-
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
| 87 |
-
|
| 88 |
-
origin = DocumentOrigin(
|
| 89 |
-
filename=self.file.name or "file",
|
| 90 |
-
mimetype="application/vnd.ms-powerpoint",
|
| 91 |
-
binary_hash=self.document_hash,
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
-
doc = DoclingDocument(
|
| 95 |
-
name=self.file.stem or "file", origin=origin
|
| 96 |
-
) # must add origin information
|
| 97 |
-
doc = self.walk_linear(self.pptx_obj, doc)
|
| 98 |
-
|
| 99 |
-
return doc
|
| 100 |
-
|
| 101 |
-
def generate_prov(
|
| 102 |
-
self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
|
| 103 |
-
):
|
| 104 |
-
if shape.left:
|
| 105 |
-
left = shape.left
|
| 106 |
-
top = shape.top
|
| 107 |
-
width = shape.width
|
| 108 |
-
height = shape.height
|
| 109 |
-
else:
|
| 110 |
-
left = 0
|
| 111 |
-
top = 0
|
| 112 |
-
width = slide_size.width
|
| 113 |
-
height = slide_size.height
|
| 114 |
-
shape_bbox = [left, top, left + width, top + height]
|
| 115 |
-
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
|
| 116 |
-
prov = ProvenanceItem(
|
| 117 |
-
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
return prov
|
| 121 |
-
|
| 122 |
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
| 123 |
-
is_a_list = False
|
| 124 |
-
is_list_group_created = False
|
| 125 |
-
enum_list_item_value = 0
|
| 126 |
-
new_list = None
|
| 127 |
-
bullet_type = "None"
|
| 128 |
-
list_text = ""
|
| 129 |
-
list_label = GroupLabel.LIST
|
| 130 |
-
doc_label = DocItemLabel.LIST_ITEM
|
| 131 |
-
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
| 132 |
-
|
| 133 |
-
# Identify if shape contains lists
|
| 134 |
-
for paragraph in shape.text_frame.paragraphs:
|
| 135 |
-
# Check if paragraph is a bullet point using the `element` XML
|
| 136 |
-
p = paragraph._element
|
| 137 |
-
if (
|
| 138 |
-
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
| 139 |
-
is not None
|
| 140 |
-
):
|
| 141 |
-
bullet_type = "Bullet"
|
| 142 |
-
is_a_list = True
|
| 143 |
-
elif (
|
| 144 |
-
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
| 145 |
-
is not None
|
| 146 |
-
):
|
| 147 |
-
bullet_type = "Numbered"
|
| 148 |
-
is_a_list = True
|
| 149 |
-
else:
|
| 150 |
-
is_a_list = False
|
| 151 |
-
|
| 152 |
-
if paragraph.level > 0:
|
| 153 |
-
# Most likely a sub-list
|
| 154 |
-
is_a_list = True
|
| 155 |
-
|
| 156 |
-
if is_a_list:
|
| 157 |
-
# Determine if this is an unordered list or an ordered list.
|
| 158 |
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
| 159 |
-
if bullet_type == "Numbered":
|
| 160 |
-
list_label = GroupLabel.ORDERED_LIST
|
| 161 |
-
|
| 162 |
-
if is_a_list:
|
| 163 |
-
_log.debug("LIST DETECTED!")
|
| 164 |
-
else:
|
| 165 |
-
_log.debug("No List")
|
| 166 |
-
|
| 167 |
-
# If there is a list inside of the shape, create a new docling list to assign list items to
|
| 168 |
-
# if is_a_list:
|
| 169 |
-
# new_list = doc.add_group(
|
| 170 |
-
# label=list_label, name=f"list", parent=parent_slide
|
| 171 |
-
# )
|
| 172 |
-
|
| 173 |
-
# Iterate through paragraphs to build up text
|
| 174 |
-
for paragraph in shape.text_frame.paragraphs:
|
| 175 |
-
# p_text = paragraph.text.strip()
|
| 176 |
-
p = paragraph._element
|
| 177 |
-
enum_list_item_value += 1
|
| 178 |
-
inline_paragraph_text = ""
|
| 179 |
-
inline_list_item_text = ""
|
| 180 |
-
|
| 181 |
-
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
| 182 |
-
if len(e.text.strip()) > 0:
|
| 183 |
-
e_is_a_list_item = False
|
| 184 |
-
is_numbered = False
|
| 185 |
-
if (
|
| 186 |
-
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
| 187 |
-
is not None
|
| 188 |
-
):
|
| 189 |
-
bullet_type = "Bullet"
|
| 190 |
-
e_is_a_list_item = True
|
| 191 |
-
elif (
|
| 192 |
-
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
| 193 |
-
is not None
|
| 194 |
-
):
|
| 195 |
-
bullet_type = "Numbered"
|
| 196 |
-
is_numbered = True
|
| 197 |
-
e_is_a_list_item = True
|
| 198 |
-
else:
|
| 199 |
-
e_is_a_list_item = False
|
| 200 |
-
|
| 201 |
-
if e_is_a_list_item:
|
| 202 |
-
if len(inline_paragraph_text) > 0:
|
| 203 |
-
# output accumulated inline text:
|
| 204 |
-
doc.add_text(
|
| 205 |
-
label=doc_label,
|
| 206 |
-
parent=parent_slide,
|
| 207 |
-
text=inline_paragraph_text,
|
| 208 |
-
prov=prov,
|
| 209 |
-
)
|
| 210 |
-
# Set marker and enumerated arguments if this is an enumeration element.
|
| 211 |
-
inline_list_item_text += e.text
|
| 212 |
-
# print(e.text)
|
| 213 |
-
else:
|
| 214 |
-
# Assign proper label to the text, depending if it's a Title or Section Header
|
| 215 |
-
# For other types of text, assign - PARAGRAPH
|
| 216 |
-
doc_label = DocItemLabel.PARAGRAPH
|
| 217 |
-
if shape.is_placeholder:
|
| 218 |
-
placeholder_type = shape.placeholder_format.type
|
| 219 |
-
if placeholder_type in [
|
| 220 |
-
PP_PLACEHOLDER.CENTER_TITLE,
|
| 221 |
-
PP_PLACEHOLDER.TITLE,
|
| 222 |
-
]:
|
| 223 |
-
# It's a title
|
| 224 |
-
doc_label = DocItemLabel.TITLE
|
| 225 |
-
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
| 226 |
-
DocItemLabel.SECTION_HEADER
|
| 227 |
-
enum_list_item_value = 0
|
| 228 |
-
inline_paragraph_text += e.text
|
| 229 |
-
|
| 230 |
-
if len(inline_paragraph_text) > 0:
|
| 231 |
-
# output accumulated inline text:
|
| 232 |
-
doc.add_text(
|
| 233 |
-
label=doc_label,
|
| 234 |
-
parent=parent_slide,
|
| 235 |
-
text=inline_paragraph_text,
|
| 236 |
-
prov=prov,
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
if len(inline_list_item_text) > 0:
|
| 240 |
-
enum_marker = ""
|
| 241 |
-
if is_numbered:
|
| 242 |
-
enum_marker = str(enum_list_item_value) + "."
|
| 243 |
-
if not is_list_group_created:
|
| 244 |
-
new_list = doc.add_group(
|
| 245 |
-
label=list_label, name=f"list", parent=parent_slide
|
| 246 |
-
)
|
| 247 |
-
is_list_group_created = True
|
| 248 |
-
doc.add_list_item(
|
| 249 |
-
marker=enum_marker,
|
| 250 |
-
enumerated=is_numbered,
|
| 251 |
-
parent=new_list,
|
| 252 |
-
text=inline_list_item_text,
|
| 253 |
-
prov=prov,
|
| 254 |
-
)
|
| 255 |
-
return
|
| 256 |
-
|
| 257 |
-
def handle_title(self, shape, parent_slide, slide_ind, doc):
|
| 258 |
-
placeholder_type = shape.placeholder_format.type
|
| 259 |
-
txt = shape.text.strip()
|
| 260 |
-
prov = self.generate_prov(shape, slide_ind, txt)
|
| 261 |
-
|
| 262 |
-
if len(txt.strip()) > 0:
|
| 263 |
-
# title = slide.shapes.title.text if slide.shapes.title else "No title"
|
| 264 |
-
if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
|
| 265 |
-
_log.info(f"Title found: {shape.text}")
|
| 266 |
-
doc.add_text(
|
| 267 |
-
label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
|
| 268 |
-
)
|
| 269 |
-
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
| 270 |
-
_log.info(f"Subtitle found: {shape.text}")
|
| 271 |
-
# Using DocItemLabel.FOOTNOTE, while SUBTITLE label is not avail.
|
| 272 |
-
doc.add_text(
|
| 273 |
-
label=DocItemLabel.SECTION_HEADER,
|
| 274 |
-
parent=parent_slide,
|
| 275 |
-
text=txt,
|
| 276 |
-
prov=prov,
|
| 277 |
-
)
|
| 278 |
-
return
|
| 279 |
-
|
| 280 |
-
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
|
| 281 |
-
# Open it with PIL
|
| 282 |
-
try:
|
| 283 |
-
# Get the image bytes
|
| 284 |
-
image = shape.image
|
| 285 |
-
image_bytes = image.blob
|
| 286 |
-
im_dpi, _ = image.dpi
|
| 287 |
-
pil_image = Image.open(BytesIO(image_bytes))
|
| 288 |
-
|
| 289 |
-
# shape has picture
|
| 290 |
-
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
| 291 |
-
doc.add_picture(
|
| 292 |
-
parent=parent_slide,
|
| 293 |
-
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
| 294 |
-
caption=None,
|
| 295 |
-
prov=prov,
|
| 296 |
-
)
|
| 297 |
-
except (UnidentifiedImageError, OSError) as e:
|
| 298 |
-
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
| 299 |
-
return
|
| 300 |
-
|
| 301 |
-
def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
|
| 302 |
-
# Handling tables, images, charts
|
| 303 |
-
if shape.has_table:
|
| 304 |
-
table = shape.table
|
| 305 |
-
table_xml = shape._element
|
| 306 |
-
|
| 307 |
-
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
| 308 |
-
|
| 309 |
-
num_cols = 0
|
| 310 |
-
num_rows = len(table.rows)
|
| 311 |
-
tcells = []
|
| 312 |
-
# Access the XML element for the shape that contains the table
|
| 313 |
-
table_xml = shape._element
|
| 314 |
-
|
| 315 |
-
for row_idx, row in enumerate(table.rows):
|
| 316 |
-
if len(row.cells) > num_cols:
|
| 317 |
-
num_cols = len(row.cells)
|
| 318 |
-
for col_idx, cell in enumerate(row.cells):
|
| 319 |
-
# Access the XML of the cell (this is the 'tc' element in table XML)
|
| 320 |
-
cell_xml = table_xml.xpath(
|
| 321 |
-
f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
|
| 322 |
-
)
|
| 323 |
-
|
| 324 |
-
if not cell_xml:
|
| 325 |
-
continue # If no cell XML is found, skip
|
| 326 |
-
|
| 327 |
-
cell_xml = cell_xml[0] # Get the first matching XML node
|
| 328 |
-
row_span = cell_xml.get("rowSpan") # Vertical span
|
| 329 |
-
col_span = cell_xml.get("gridSpan") # Horizontal span
|
| 330 |
-
|
| 331 |
-
if row_span is None:
|
| 332 |
-
row_span = 1
|
| 333 |
-
else:
|
| 334 |
-
row_span = int(row_span)
|
| 335 |
-
|
| 336 |
-
if col_span is None:
|
| 337 |
-
col_span = 1
|
| 338 |
-
else:
|
| 339 |
-
col_span = int(col_span)
|
| 340 |
-
|
| 341 |
-
icell = TableCell(
|
| 342 |
-
text=cell.text.strip(),
|
| 343 |
-
row_span=row_span,
|
| 344 |
-
col_span=col_span,
|
| 345 |
-
start_row_offset_idx=row_idx,
|
| 346 |
-
end_row_offset_idx=row_idx + row_span,
|
| 347 |
-
start_col_offset_idx=col_idx,
|
| 348 |
-
end_col_offset_idx=col_idx + col_span,
|
| 349 |
-
col_header=False,
|
| 350 |
-
row_header=False,
|
| 351 |
-
)
|
| 352 |
-
if len(cell.text.strip()) > 0:
|
| 353 |
-
tcells.append(icell)
|
| 354 |
-
# Initialize Docling TableData
|
| 355 |
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
| 356 |
-
# Populate
|
| 357 |
-
for tcell in tcells:
|
| 358 |
-
data.table_cells.append(tcell)
|
| 359 |
-
if len(tcells) > 0:
|
| 360 |
-
# If table is not fully empty...
|
| 361 |
-
# Create Docling table
|
| 362 |
-
doc.add_table(parent=parent_slide, data=data, prov=prov)
|
| 363 |
-
return
|
| 364 |
-
|
| 365 |
-
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
| 366 |
-
# Units of size in PPTX by default are EMU units (English Metric Units)
|
| 367 |
-
slide_width = pptx_obj.slide_width
|
| 368 |
-
slide_height = pptx_obj.slide_height
|
| 369 |
-
|
| 370 |
-
text_content = [] # type: ignore
|
| 371 |
-
|
| 372 |
-
max_levels = 10
|
| 373 |
-
parents = {} # type: ignore
|
| 374 |
-
for i in range(0, max_levels):
|
| 375 |
-
parents[i] = None
|
| 376 |
-
|
| 377 |
-
# Loop through each slide
|
| 378 |
-
for slide_num, slide in enumerate(pptx_obj.slides):
|
| 379 |
-
slide_ind = pptx_obj.slides.index(slide)
|
| 380 |
-
parent_slide = doc.add_group(
|
| 381 |
-
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
| 382 |
-
)
|
| 383 |
-
|
| 384 |
-
slide_size = Size(width=slide_width, height=slide_height)
|
| 385 |
-
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
| 386 |
-
|
| 387 |
-
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
| 388 |
-
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
| 389 |
-
if shape.has_table:
|
| 390 |
-
# Handle Tables
|
| 391 |
-
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
| 392 |
-
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
| 393 |
-
# Handle Pictures
|
| 394 |
-
self.handle_pictures(
|
| 395 |
-
shape, parent_slide, slide_ind, doc, slide_size
|
| 396 |
-
)
|
| 397 |
-
# If shape doesn't have any text, move on to the next shape
|
| 398 |
-
if not hasattr(shape, "text"):
|
| 399 |
-
return
|
| 400 |
-
if shape.text is None:
|
| 401 |
-
return
|
| 402 |
-
if len(shape.text.strip()) == 0:
|
| 403 |
-
return
|
| 404 |
-
if not shape.has_text_frame:
|
| 405 |
-
_log.warning("Warning: shape has text but not text_frame")
|
| 406 |
-
return
|
| 407 |
-
# Handle other text elements, including lists (bullet lists, numbered lists)
|
| 408 |
-
self.handle_text_elements(
|
| 409 |
-
shape, parent_slide, slide_ind, doc, slide_size
|
| 410 |
-
)
|
| 411 |
-
return
|
| 412 |
-
|
| 413 |
-
def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
|
| 414 |
-
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
| 415 |
-
for groupedshape in shape.shapes:
|
| 416 |
-
handle_shapes(
|
| 417 |
-
groupedshape, parent_slide, slide_ind, doc, slide_size
|
| 418 |
-
)
|
| 419 |
-
|
| 420 |
-
# Loop through each shape in the slide
|
| 421 |
-
for shape in slide.shapes:
|
| 422 |
-
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
| 423 |
-
|
| 424 |
-
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py
DELETED
|
@@ -1,582 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import re
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Any, Optional, Union
|
| 6 |
-
|
| 7 |
-
from docling_core.types.doc import (
|
| 8 |
-
DocItemLabel,
|
| 9 |
-
DoclingDocument,
|
| 10 |
-
DocumentOrigin,
|
| 11 |
-
GroupLabel,
|
| 12 |
-
ImageRef,
|
| 13 |
-
NodeItem,
|
| 14 |
-
TableCell,
|
| 15 |
-
TableData,
|
| 16 |
-
)
|
| 17 |
-
from docx import Document
|
| 18 |
-
from docx.document import Document as DocxDocument
|
| 19 |
-
from docx.oxml.table import CT_Tc
|
| 20 |
-
from docx.oxml.xmlchemy import BaseOxmlElement
|
| 21 |
-
from docx.table import Table, _Cell
|
| 22 |
-
from docx.text.paragraph import Paragraph
|
| 23 |
-
from lxml import etree
|
| 24 |
-
from lxml.etree import XPath
|
| 25 |
-
from PIL import Image, UnidentifiedImageError
|
| 26 |
-
from typing_extensions import override
|
| 27 |
-
|
| 28 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 29 |
-
from docling.datamodel.base_models import InputFormat
|
| 30 |
-
from docling.datamodel.document import InputDocument
|
| 31 |
-
|
| 32 |
-
_log = logging.getLogger(__name__)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
| 36 |
-
@override
|
| 37 |
-
def __init__(
|
| 38 |
-
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
| 39 |
-
) -> None:
|
| 40 |
-
super().__init__(in_doc, path_or_stream)
|
| 41 |
-
self.XML_KEY = (
|
| 42 |
-
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
| 43 |
-
)
|
| 44 |
-
self.xml_namespaces = {
|
| 45 |
-
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
| 46 |
-
}
|
| 47 |
-
# self.initialise(path_or_stream)
|
| 48 |
-
# Word file:
|
| 49 |
-
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
| 50 |
-
self.valid: bool = False
|
| 51 |
-
# Initialise the parents for the hierarchy
|
| 52 |
-
self.max_levels: int = 10
|
| 53 |
-
self.level_at_new_list: Optional[int] = None
|
| 54 |
-
self.parents: dict[int, Optional[NodeItem]] = {}
|
| 55 |
-
for i in range(-1, self.max_levels):
|
| 56 |
-
self.parents[i] = None
|
| 57 |
-
|
| 58 |
-
self.level = 0
|
| 59 |
-
self.listIter = 0
|
| 60 |
-
|
| 61 |
-
self.history: dict[str, Any] = {
|
| 62 |
-
"names": [None],
|
| 63 |
-
"levels": [None],
|
| 64 |
-
"numids": [None],
|
| 65 |
-
"indents": [None],
|
| 66 |
-
}
|
| 67 |
-
|
| 68 |
-
self.docx_obj = None
|
| 69 |
-
try:
|
| 70 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 71 |
-
self.docx_obj = Document(self.path_or_stream)
|
| 72 |
-
elif isinstance(self.path_or_stream, Path):
|
| 73 |
-
self.docx_obj = Document(str(self.path_or_stream))
|
| 74 |
-
|
| 75 |
-
self.valid = True
|
| 76 |
-
except Exception as e:
|
| 77 |
-
raise RuntimeError(
|
| 78 |
-
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
| 79 |
-
) from e
|
| 80 |
-
|
| 81 |
-
@override
|
| 82 |
-
def is_valid(self) -> bool:
|
| 83 |
-
return self.valid
|
| 84 |
-
|
| 85 |
-
@classmethod
|
| 86 |
-
@override
|
| 87 |
-
def supports_pagination(cls) -> bool:
|
| 88 |
-
return False
|
| 89 |
-
|
| 90 |
-
@override
|
| 91 |
-
def unload(self):
|
| 92 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 93 |
-
self.path_or_stream.close()
|
| 94 |
-
|
| 95 |
-
self.path_or_stream = None
|
| 96 |
-
|
| 97 |
-
@classmethod
|
| 98 |
-
@override
|
| 99 |
-
def supported_formats(cls) -> set[InputFormat]:
|
| 100 |
-
return {InputFormat.DOCX}
|
| 101 |
-
|
| 102 |
-
@override
|
| 103 |
-
def convert(self) -> DoclingDocument:
|
| 104 |
-
"""Parses the DOCX into a structured document model.
|
| 105 |
-
|
| 106 |
-
Returns:
|
| 107 |
-
The parsed document.
|
| 108 |
-
"""
|
| 109 |
-
|
| 110 |
-
origin = DocumentOrigin(
|
| 111 |
-
filename=self.file.name or "file",
|
| 112 |
-
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 113 |
-
binary_hash=self.document_hash,
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
| 117 |
-
if self.is_valid():
|
| 118 |
-
assert self.docx_obj is not None
|
| 119 |
-
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
| 120 |
-
return doc
|
| 121 |
-
else:
|
| 122 |
-
raise RuntimeError(
|
| 123 |
-
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
| 124 |
-
)
|
| 125 |
-
|
| 126 |
-
def update_history(
|
| 127 |
-
self,
|
| 128 |
-
name: str,
|
| 129 |
-
level: Optional[int],
|
| 130 |
-
numid: Optional[int],
|
| 131 |
-
ilevel: Optional[int],
|
| 132 |
-
):
|
| 133 |
-
self.history["names"].append(name)
|
| 134 |
-
self.history["levels"].append(level)
|
| 135 |
-
|
| 136 |
-
self.history["numids"].append(numid)
|
| 137 |
-
self.history["indents"].append(ilevel)
|
| 138 |
-
|
| 139 |
-
def prev_name(self) -> Optional[str]:
|
| 140 |
-
return self.history["names"][-1]
|
| 141 |
-
|
| 142 |
-
def prev_level(self) -> Optional[int]:
|
| 143 |
-
return self.history["levels"][-1]
|
| 144 |
-
|
| 145 |
-
def prev_numid(self) -> Optional[int]:
|
| 146 |
-
return self.history["numids"][-1]
|
| 147 |
-
|
| 148 |
-
def prev_indent(self) -> Optional[int]:
|
| 149 |
-
return self.history["indents"][-1]
|
| 150 |
-
|
| 151 |
-
def get_level(self) -> int:
|
| 152 |
-
"""Return the first None index."""
|
| 153 |
-
for k, v in self.parents.items():
|
| 154 |
-
if k >= 0 and v == None:
|
| 155 |
-
return k
|
| 156 |
-
return 0
|
| 157 |
-
|
| 158 |
-
def walk_linear(
|
| 159 |
-
self,
|
| 160 |
-
body: BaseOxmlElement,
|
| 161 |
-
docx_obj: DocxDocument,
|
| 162 |
-
doc: DoclingDocument,
|
| 163 |
-
) -> DoclingDocument:
|
| 164 |
-
for element in body:
|
| 165 |
-
tag_name = etree.QName(element).localname
|
| 166 |
-
# Check for Inline Images (blip elements)
|
| 167 |
-
namespaces = {
|
| 168 |
-
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
| 169 |
-
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
| 170 |
-
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
| 171 |
-
}
|
| 172 |
-
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
| 173 |
-
drawing_blip = xpath_expr(element)
|
| 174 |
-
|
| 175 |
-
# Check for Tables
|
| 176 |
-
if element.tag.endswith("tbl"):
|
| 177 |
-
try:
|
| 178 |
-
self.handle_tables(element, docx_obj, doc)
|
| 179 |
-
except Exception:
|
| 180 |
-
_log.debug("could not parse a table, broken docx table")
|
| 181 |
-
|
| 182 |
-
elif drawing_blip:
|
| 183 |
-
self.handle_pictures(docx_obj, drawing_blip, doc)
|
| 184 |
-
# Check for the sdt containers, like table of contents
|
| 185 |
-
elif tag_name in ["sdt"]:
|
| 186 |
-
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
| 187 |
-
if sdt_content is not None:
|
| 188 |
-
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
| 189 |
-
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
| 190 |
-
for p in paragraphs:
|
| 191 |
-
self.handle_text_elements(p, docx_obj, doc)
|
| 192 |
-
# Check for Text
|
| 193 |
-
elif tag_name in ["p"]:
|
| 194 |
-
# "tcPr", "sectPr"
|
| 195 |
-
self.handle_text_elements(element, docx_obj, doc)
|
| 196 |
-
else:
|
| 197 |
-
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
| 198 |
-
return doc
|
| 199 |
-
|
| 200 |
-
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
|
| 201 |
-
if s is None:
|
| 202 |
-
return None
|
| 203 |
-
try:
|
| 204 |
-
return int(s)
|
| 205 |
-
except ValueError:
|
| 206 |
-
return default
|
| 207 |
-
|
| 208 |
-
def split_text_and_number(self, input_string: str) -> list[str]:
|
| 209 |
-
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
| 210 |
-
if match:
|
| 211 |
-
parts = list(filter(None, match.groups()))
|
| 212 |
-
return parts
|
| 213 |
-
else:
|
| 214 |
-
return [input_string]
|
| 215 |
-
|
| 216 |
-
def get_numId_and_ilvl(
|
| 217 |
-
self, paragraph: Paragraph
|
| 218 |
-
) -> tuple[Optional[int], Optional[int]]:
|
| 219 |
-
# Access the XML element of the paragraph
|
| 220 |
-
numPr = paragraph._element.find(
|
| 221 |
-
".//w:numPr", namespaces=paragraph._element.nsmap
|
| 222 |
-
)
|
| 223 |
-
|
| 224 |
-
if numPr is not None:
|
| 225 |
-
# Get the numId element and extract the value
|
| 226 |
-
numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
|
| 227 |
-
ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
|
| 228 |
-
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
| 229 |
-
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
| 230 |
-
|
| 231 |
-
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
|
| 232 |
-
|
| 233 |
-
return None, None # If the paragraph is not part of a list
|
| 234 |
-
|
| 235 |
-
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
| 236 |
-
if paragraph.style is None:
|
| 237 |
-
return "Normal", None
|
| 238 |
-
label = paragraph.style.style_id
|
| 239 |
-
if label is None:
|
| 240 |
-
return "Normal", None
|
| 241 |
-
if ":" in label:
|
| 242 |
-
parts = label.split(":")
|
| 243 |
-
|
| 244 |
-
if len(parts) == 2:
|
| 245 |
-
return parts[0], self.str_to_int(parts[1], None)
|
| 246 |
-
|
| 247 |
-
parts = self.split_text_and_number(label)
|
| 248 |
-
|
| 249 |
-
if "Heading" in label and len(parts) == 2:
|
| 250 |
-
parts.sort()
|
| 251 |
-
label_str: str = ""
|
| 252 |
-
label_level: Optional[int] = 0
|
| 253 |
-
if parts[0] == "Heading":
|
| 254 |
-
label_str = parts[0]
|
| 255 |
-
label_level = self.str_to_int(parts[1], None)
|
| 256 |
-
if parts[1] == "Heading":
|
| 257 |
-
label_str = parts[1]
|
| 258 |
-
label_level = self.str_to_int(parts[0], None)
|
| 259 |
-
return label_str, label_level
|
| 260 |
-
else:
|
| 261 |
-
return label, None
|
| 262 |
-
|
| 263 |
-
def handle_text_elements(
|
| 264 |
-
self,
|
| 265 |
-
element: BaseOxmlElement,
|
| 266 |
-
docx_obj: DocxDocument,
|
| 267 |
-
doc: DoclingDocument,
|
| 268 |
-
) -> None:
|
| 269 |
-
paragraph = Paragraph(element, docx_obj)
|
| 270 |
-
|
| 271 |
-
if paragraph.text is None:
|
| 272 |
-
return
|
| 273 |
-
text = paragraph.text.strip()
|
| 274 |
-
|
| 275 |
-
# Common styles for bullet and numbered lists.
|
| 276 |
-
# "List Bullet", "List Number", "List Paragraph"
|
| 277 |
-
# Identify wether list is a numbered list or not
|
| 278 |
-
# is_numbered = "List Bullet" not in paragraph.style.name
|
| 279 |
-
is_numbered = False
|
| 280 |
-
p_style_id, p_level = self.get_label_and_level(paragraph)
|
| 281 |
-
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
| 282 |
-
|
| 283 |
-
if numid == 0:
|
| 284 |
-
numid = None
|
| 285 |
-
|
| 286 |
-
# Handle lists
|
| 287 |
-
if (
|
| 288 |
-
numid is not None
|
| 289 |
-
and ilevel is not None
|
| 290 |
-
and p_style_id not in ["Title", "Heading"]
|
| 291 |
-
):
|
| 292 |
-
self.add_listitem(
|
| 293 |
-
doc,
|
| 294 |
-
numid,
|
| 295 |
-
ilevel,
|
| 296 |
-
text,
|
| 297 |
-
is_numbered,
|
| 298 |
-
)
|
| 299 |
-
self.update_history(p_style_id, p_level, numid, ilevel)
|
| 300 |
-
return
|
| 301 |
-
elif (
|
| 302 |
-
numid is None
|
| 303 |
-
and self.prev_numid() is not None
|
| 304 |
-
and p_style_id not in ["Title", "Heading"]
|
| 305 |
-
): # Close list
|
| 306 |
-
if self.level_at_new_list:
|
| 307 |
-
for key in range(len(self.parents)):
|
| 308 |
-
if key >= self.level_at_new_list:
|
| 309 |
-
self.parents[key] = None
|
| 310 |
-
self.level = self.level_at_new_list - 1
|
| 311 |
-
self.level_at_new_list = None
|
| 312 |
-
else:
|
| 313 |
-
for key in range(len(self.parents)):
|
| 314 |
-
self.parents[key] = None
|
| 315 |
-
self.level = 0
|
| 316 |
-
|
| 317 |
-
if p_style_id in ["Title"]:
|
| 318 |
-
for key in range(len(self.parents)):
|
| 319 |
-
self.parents[key] = None
|
| 320 |
-
self.parents[0] = doc.add_text(
|
| 321 |
-
parent=None, label=DocItemLabel.TITLE, text=text
|
| 322 |
-
)
|
| 323 |
-
elif "Heading" in p_style_id:
|
| 324 |
-
self.add_header(doc, p_level, text)
|
| 325 |
-
|
| 326 |
-
elif p_style_id in [
|
| 327 |
-
"Paragraph",
|
| 328 |
-
"Normal",
|
| 329 |
-
"Subtitle",
|
| 330 |
-
"Author",
|
| 331 |
-
"DefaultText",
|
| 332 |
-
"ListParagraph",
|
| 333 |
-
"ListBullet",
|
| 334 |
-
"Quote",
|
| 335 |
-
]:
|
| 336 |
-
level = self.get_level()
|
| 337 |
-
doc.add_text(
|
| 338 |
-
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
| 339 |
-
)
|
| 340 |
-
|
| 341 |
-
else:
|
| 342 |
-
# Text style names can, and will have, not only default values but user values too
|
| 343 |
-
# hence we treat all other labels as pure text
|
| 344 |
-
level = self.get_level()
|
| 345 |
-
doc.add_text(
|
| 346 |
-
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
| 347 |
-
)
|
| 348 |
-
|
| 349 |
-
self.update_history(p_style_id, p_level, numid, ilevel)
|
| 350 |
-
return
|
| 351 |
-
|
| 352 |
-
def add_header(
|
| 353 |
-
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
| 354 |
-
) -> None:
|
| 355 |
-
level = self.get_level()
|
| 356 |
-
if isinstance(curr_level, int):
|
| 357 |
-
if curr_level > level:
|
| 358 |
-
# add invisible group
|
| 359 |
-
for i in range(level, curr_level):
|
| 360 |
-
self.parents[i] = doc.add_group(
|
| 361 |
-
parent=self.parents[i - 1],
|
| 362 |
-
label=GroupLabel.SECTION,
|
| 363 |
-
name=f"header-{i}",
|
| 364 |
-
)
|
| 365 |
-
elif curr_level < level:
|
| 366 |
-
# remove the tail
|
| 367 |
-
for key in range(len(self.parents)):
|
| 368 |
-
if key >= curr_level:
|
| 369 |
-
self.parents[key] = None
|
| 370 |
-
|
| 371 |
-
self.parents[curr_level] = doc.add_heading(
|
| 372 |
-
parent=self.parents[curr_level - 1],
|
| 373 |
-
text=text,
|
| 374 |
-
level=curr_level,
|
| 375 |
-
)
|
| 376 |
-
else:
|
| 377 |
-
self.parents[self.level] = doc.add_heading(
|
| 378 |
-
parent=self.parents[self.level - 1],
|
| 379 |
-
text=text,
|
| 380 |
-
level=1,
|
| 381 |
-
)
|
| 382 |
-
return
|
| 383 |
-
|
| 384 |
-
def add_listitem(
|
| 385 |
-
self,
|
| 386 |
-
doc: DoclingDocument,
|
| 387 |
-
numid: int,
|
| 388 |
-
ilevel: int,
|
| 389 |
-
text: str,
|
| 390 |
-
is_numbered: bool = False,
|
| 391 |
-
) -> None:
|
| 392 |
-
enum_marker = ""
|
| 393 |
-
|
| 394 |
-
level = self.get_level()
|
| 395 |
-
prev_indent = self.prev_indent()
|
| 396 |
-
if self.prev_numid() is None: # Open new list
|
| 397 |
-
self.level_at_new_list = level
|
| 398 |
-
|
| 399 |
-
self.parents[level] = doc.add_group(
|
| 400 |
-
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
| 401 |
-
)
|
| 402 |
-
|
| 403 |
-
# Set marker and enumerated arguments if this is an enumeration element.
|
| 404 |
-
self.listIter += 1
|
| 405 |
-
if is_numbered:
|
| 406 |
-
enum_marker = str(self.listIter) + "."
|
| 407 |
-
is_numbered = True
|
| 408 |
-
doc.add_list_item(
|
| 409 |
-
marker=enum_marker,
|
| 410 |
-
enumerated=is_numbered,
|
| 411 |
-
parent=self.parents[level],
|
| 412 |
-
text=text,
|
| 413 |
-
)
|
| 414 |
-
|
| 415 |
-
elif (
|
| 416 |
-
self.prev_numid() == numid
|
| 417 |
-
and self.level_at_new_list is not None
|
| 418 |
-
and prev_indent is not None
|
| 419 |
-
and prev_indent < ilevel
|
| 420 |
-
): # Open indented list
|
| 421 |
-
for i in range(
|
| 422 |
-
self.level_at_new_list + prev_indent + 1,
|
| 423 |
-
self.level_at_new_list + ilevel + 1,
|
| 424 |
-
):
|
| 425 |
-
# Determine if this is an unordered list or an ordered list.
|
| 426 |
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
| 427 |
-
self.listIter = 0
|
| 428 |
-
if is_numbered:
|
| 429 |
-
self.parents[i] = doc.add_group(
|
| 430 |
-
label=GroupLabel.ORDERED_LIST,
|
| 431 |
-
name="list",
|
| 432 |
-
parent=self.parents[i - 1],
|
| 433 |
-
)
|
| 434 |
-
else:
|
| 435 |
-
self.parents[i] = doc.add_group(
|
| 436 |
-
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
| 437 |
-
)
|
| 438 |
-
|
| 439 |
-
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
| 440 |
-
self.listIter += 1
|
| 441 |
-
if is_numbered:
|
| 442 |
-
enum_marker = str(self.listIter) + "."
|
| 443 |
-
is_numbered = True
|
| 444 |
-
doc.add_list_item(
|
| 445 |
-
marker=enum_marker,
|
| 446 |
-
enumerated=is_numbered,
|
| 447 |
-
parent=self.parents[self.level_at_new_list + ilevel],
|
| 448 |
-
text=text,
|
| 449 |
-
)
|
| 450 |
-
|
| 451 |
-
elif (
|
| 452 |
-
self.prev_numid() == numid
|
| 453 |
-
and self.level_at_new_list is not None
|
| 454 |
-
and prev_indent is not None
|
| 455 |
-
and ilevel < prev_indent
|
| 456 |
-
): # Close list
|
| 457 |
-
for k, v in self.parents.items():
|
| 458 |
-
if k > self.level_at_new_list + ilevel:
|
| 459 |
-
self.parents[k] = None
|
| 460 |
-
|
| 461 |
-
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
| 462 |
-
self.listIter += 1
|
| 463 |
-
if is_numbered:
|
| 464 |
-
enum_marker = str(self.listIter) + "."
|
| 465 |
-
is_numbered = True
|
| 466 |
-
doc.add_list_item(
|
| 467 |
-
marker=enum_marker,
|
| 468 |
-
enumerated=is_numbered,
|
| 469 |
-
parent=self.parents[self.level_at_new_list + ilevel],
|
| 470 |
-
text=text,
|
| 471 |
-
)
|
| 472 |
-
self.listIter = 0
|
| 473 |
-
|
| 474 |
-
elif self.prev_numid() == numid or prev_indent == ilevel:
|
| 475 |
-
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
| 476 |
-
self.listIter += 1
|
| 477 |
-
if is_numbered:
|
| 478 |
-
enum_marker = str(self.listIter) + "."
|
| 479 |
-
is_numbered = True
|
| 480 |
-
doc.add_list_item(
|
| 481 |
-
marker=enum_marker,
|
| 482 |
-
enumerated=is_numbered,
|
| 483 |
-
parent=self.parents[level - 1],
|
| 484 |
-
text=text,
|
| 485 |
-
)
|
| 486 |
-
return
|
| 487 |
-
|
| 488 |
-
def handle_tables(
|
| 489 |
-
self,
|
| 490 |
-
element: BaseOxmlElement,
|
| 491 |
-
docx_obj: DocxDocument,
|
| 492 |
-
doc: DoclingDocument,
|
| 493 |
-
) -> None:
|
| 494 |
-
table: Table = Table(element, docx_obj)
|
| 495 |
-
num_rows = len(table.rows)
|
| 496 |
-
num_cols = len(table.columns)
|
| 497 |
-
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
|
| 498 |
-
|
| 499 |
-
if num_rows == 1 and num_cols == 1:
|
| 500 |
-
cell_element = table.rows[0].cells[0]
|
| 501 |
-
# In case we have a table of only 1 cell, we consider it furniture
|
| 502 |
-
# And proceed processing the content of the cell as though it's in the document body
|
| 503 |
-
self.walk_linear(cell_element._element, docx_obj, doc)
|
| 504 |
-
return
|
| 505 |
-
|
| 506 |
-
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
| 507 |
-
cell_set: set[CT_Tc] = set()
|
| 508 |
-
for row_idx, row in enumerate(table.rows):
|
| 509 |
-
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
| 510 |
-
col_idx = 0
|
| 511 |
-
while col_idx < num_cols:
|
| 512 |
-
cell: _Cell = row.cells[col_idx]
|
| 513 |
-
_log.debug(
|
| 514 |
-
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
| 515 |
-
)
|
| 516 |
-
if cell is None or cell._tc in cell_set:
|
| 517 |
-
_log.debug(f" skipped since repeated content")
|
| 518 |
-
col_idx += cell.grid_span
|
| 519 |
-
continue
|
| 520 |
-
else:
|
| 521 |
-
cell_set.add(cell._tc)
|
| 522 |
-
|
| 523 |
-
spanned_idx = row_idx
|
| 524 |
-
spanned_tc: Optional[CT_Tc] = cell._tc
|
| 525 |
-
while spanned_tc == cell._tc:
|
| 526 |
-
spanned_idx += 1
|
| 527 |
-
spanned_tc = (
|
| 528 |
-
table.rows[spanned_idx].cells[col_idx]._tc
|
| 529 |
-
if spanned_idx < num_rows
|
| 530 |
-
else None
|
| 531 |
-
)
|
| 532 |
-
_log.debug(f" spanned before row {spanned_idx}")
|
| 533 |
-
|
| 534 |
-
table_cell = TableCell(
|
| 535 |
-
text=cell.text,
|
| 536 |
-
row_span=spanned_idx - row_idx,
|
| 537 |
-
col_span=cell.grid_span,
|
| 538 |
-
start_row_offset_idx=row.grid_cols_before + row_idx,
|
| 539 |
-
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
| 540 |
-
start_col_offset_idx=col_idx,
|
| 541 |
-
end_col_offset_idx=col_idx + cell.grid_span,
|
| 542 |
-
col_header=False,
|
| 543 |
-
row_header=False,
|
| 544 |
-
)
|
| 545 |
-
data.table_cells.append(table_cell)
|
| 546 |
-
col_idx += cell.grid_span
|
| 547 |
-
|
| 548 |
-
level = self.get_level()
|
| 549 |
-
doc.add_table(data=data, parent=self.parents[level - 1])
|
| 550 |
-
return
|
| 551 |
-
|
| 552 |
-
def handle_pictures(
|
| 553 |
-
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
| 554 |
-
) -> None:
|
| 555 |
-
def get_docx_image(drawing_blip):
|
| 556 |
-
rId = drawing_blip[0].get(
|
| 557 |
-
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
| 558 |
-
)
|
| 559 |
-
if rId in docx_obj.part.rels:
|
| 560 |
-
# Access the image part using the relationship ID
|
| 561 |
-
image_part = docx_obj.part.rels[rId].target_part
|
| 562 |
-
image_data = image_part.blob # Get the binary image data
|
| 563 |
-
return image_data
|
| 564 |
-
|
| 565 |
-
level = self.get_level()
|
| 566 |
-
# Open the BytesIO object with PIL to create an Image
|
| 567 |
-
try:
|
| 568 |
-
image_data = get_docx_image(drawing_blip)
|
| 569 |
-
image_bytes = BytesIO(image_data)
|
| 570 |
-
pil_image = Image.open(image_bytes)
|
| 571 |
-
doc.add_picture(
|
| 572 |
-
parent=self.parents[level - 1],
|
| 573 |
-
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
| 574 |
-
caption=None,
|
| 575 |
-
)
|
| 576 |
-
except (UnidentifiedImageError, OSError) as e:
|
| 577 |
-
_log.warning("Warning: image cannot be loaded by Pillow")
|
| 578 |
-
doc.add_picture(
|
| 579 |
-
parent=self.parents[level - 1],
|
| 580 |
-
caption=None,
|
| 581 |
-
)
|
| 582 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
| 1 |
-
from abc import ABC, abstractmethod
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Iterable, Optional, Set, Union
|
| 5 |
-
|
| 6 |
-
from docling_core.types.doc import BoundingBox, Size
|
| 7 |
-
from PIL import Image
|
| 8 |
-
|
| 9 |
-
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
| 10 |
-
from docling.datamodel.base_models import Cell, InputFormat
|
| 11 |
-
from docling.datamodel.document import InputDocument
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class PdfPageBackend(ABC):
|
| 15 |
-
@abstractmethod
|
| 16 |
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
| 17 |
-
pass
|
| 18 |
-
|
| 19 |
-
@abstractmethod
|
| 20 |
-
def get_text_cells(self) -> Iterable[Cell]:
|
| 21 |
-
pass
|
| 22 |
-
|
| 23 |
-
@abstractmethod
|
| 24 |
-
def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
|
| 25 |
-
pass
|
| 26 |
-
|
| 27 |
-
@abstractmethod
|
| 28 |
-
def get_page_image(
|
| 29 |
-
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
| 30 |
-
) -> Image.Image:
|
| 31 |
-
pass
|
| 32 |
-
|
| 33 |
-
@abstractmethod
|
| 34 |
-
def get_size(self) -> Size:
|
| 35 |
-
pass
|
| 36 |
-
|
| 37 |
-
@abstractmethod
|
| 38 |
-
def is_valid(self) -> bool:
|
| 39 |
-
pass
|
| 40 |
-
|
| 41 |
-
@abstractmethod
|
| 42 |
-
def unload(self):
|
| 43 |
-
pass
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
class PdfDocumentBackend(PaginatedDocumentBackend):
|
| 47 |
-
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
| 48 |
-
super().__init__(in_doc, path_or_stream)
|
| 49 |
-
|
| 50 |
-
if self.input_format is not InputFormat.PDF:
|
| 51 |
-
if self.input_format is InputFormat.IMAGE:
|
| 52 |
-
buf = BytesIO()
|
| 53 |
-
img = Image.open(self.path_or_stream)
|
| 54 |
-
img.save(buf, "PDF")
|
| 55 |
-
buf.seek(0)
|
| 56 |
-
self.path_or_stream = buf
|
| 57 |
-
else:
|
| 58 |
-
raise RuntimeError(
|
| 59 |
-
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
-
@abstractmethod
|
| 63 |
-
def load_page(self, page_no: int) -> PdfPageBackend:
|
| 64 |
-
pass
|
| 65 |
-
|
| 66 |
-
@abstractmethod
|
| 67 |
-
def page_count(self) -> int:
|
| 68 |
-
pass
|
| 69 |
-
|
| 70 |
-
@classmethod
|
| 71 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 72 |
-
return {InputFormat.PDF}
|
| 73 |
-
|
| 74 |
-
@classmethod
|
| 75 |
-
def supports_pagination(cls) -> bool:
|
| 76 |
-
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py
DELETED
|
@@ -1,260 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import random
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
| 6 |
-
|
| 7 |
-
import pypdfium2 as pdfium
|
| 8 |
-
import pypdfium2.raw as pdfium_c
|
| 9 |
-
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
| 10 |
-
from PIL import Image, ImageDraw
|
| 11 |
-
from pypdfium2 import PdfTextPage
|
| 12 |
-
from pypdfium2._helpers.misc import PdfiumError
|
| 13 |
-
|
| 14 |
-
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
| 15 |
-
from docling.datamodel.base_models import Cell
|
| 16 |
-
|
| 17 |
-
if TYPE_CHECKING:
|
| 18 |
-
from docling.datamodel.document import InputDocument
|
| 19 |
-
|
| 20 |
-
_log = logging.getLogger(__name__)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class PyPdfiumPageBackend(PdfPageBackend):
|
| 24 |
-
def __init__(
|
| 25 |
-
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
| 26 |
-
):
|
| 27 |
-
self.valid = True # No better way to tell from pypdfium.
|
| 28 |
-
try:
|
| 29 |
-
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
| 30 |
-
except PdfiumError as e:
|
| 31 |
-
_log.info(
|
| 32 |
-
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
| 33 |
-
exc_info=True,
|
| 34 |
-
)
|
| 35 |
-
self.valid = False
|
| 36 |
-
self.text_page: Optional[PdfTextPage] = None
|
| 37 |
-
|
| 38 |
-
def is_valid(self) -> bool:
|
| 39 |
-
return self.valid
|
| 40 |
-
|
| 41 |
-
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
| 42 |
-
AREA_THRESHOLD = 0 # 32 * 32
|
| 43 |
-
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
| 44 |
-
pos = obj.get_pos()
|
| 45 |
-
cropbox = BoundingBox.from_tuple(
|
| 46 |
-
pos, origin=CoordOrigin.BOTTOMLEFT
|
| 47 |
-
).to_top_left_origin(page_height=self.get_size().height)
|
| 48 |
-
|
| 49 |
-
if cropbox.area() > AREA_THRESHOLD:
|
| 50 |
-
cropbox = cropbox.scaled(scale=scale)
|
| 51 |
-
|
| 52 |
-
yield cropbox
|
| 53 |
-
|
| 54 |
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
| 55 |
-
if not self.text_page:
|
| 56 |
-
self.text_page = self._ppage.get_textpage()
|
| 57 |
-
|
| 58 |
-
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
| 59 |
-
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
| 60 |
-
|
| 61 |
-
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
| 62 |
-
|
| 63 |
-
return text_piece
|
| 64 |
-
|
| 65 |
-
def get_text_cells(self) -> Iterable[Cell]:
|
| 66 |
-
if not self.text_page:
|
| 67 |
-
self.text_page = self._ppage.get_textpage()
|
| 68 |
-
|
| 69 |
-
cells = []
|
| 70 |
-
cell_counter = 0
|
| 71 |
-
|
| 72 |
-
page_size = self.get_size()
|
| 73 |
-
|
| 74 |
-
for i in range(self.text_page.count_rects()):
|
| 75 |
-
rect = self.text_page.get_rect(i)
|
| 76 |
-
text_piece = self.text_page.get_text_bounded(*rect)
|
| 77 |
-
x0, y0, x1, y1 = rect
|
| 78 |
-
cells.append(
|
| 79 |
-
Cell(
|
| 80 |
-
id=cell_counter,
|
| 81 |
-
text=text_piece,
|
| 82 |
-
bbox=BoundingBox(
|
| 83 |
-
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
| 84 |
-
).to_top_left_origin(page_size.height),
|
| 85 |
-
)
|
| 86 |
-
)
|
| 87 |
-
cell_counter += 1
|
| 88 |
-
|
| 89 |
-
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
| 90 |
-
# The cell merging code below is to clean this up.
|
| 91 |
-
def merge_horizontal_cells(
|
| 92 |
-
cells: List[Cell],
|
| 93 |
-
horizontal_threshold_factor: float = 1.0,
|
| 94 |
-
vertical_threshold_factor: float = 0.5,
|
| 95 |
-
) -> List[Cell]:
|
| 96 |
-
if not cells:
|
| 97 |
-
return []
|
| 98 |
-
|
| 99 |
-
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
| 100 |
-
rows = []
|
| 101 |
-
current_row = [cells[0]]
|
| 102 |
-
row_top = cells[0].bbox.t
|
| 103 |
-
row_bottom = cells[0].bbox.b
|
| 104 |
-
row_height = cells[0].bbox.height
|
| 105 |
-
|
| 106 |
-
for cell in cells[1:]:
|
| 107 |
-
vertical_threshold = row_height * vertical_threshold_factor
|
| 108 |
-
if (
|
| 109 |
-
abs(cell.bbox.t - row_top) <= vertical_threshold
|
| 110 |
-
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
| 111 |
-
):
|
| 112 |
-
current_row.append(cell)
|
| 113 |
-
row_top = min(row_top, cell.bbox.t)
|
| 114 |
-
row_bottom = max(row_bottom, cell.bbox.b)
|
| 115 |
-
row_height = row_bottom - row_top
|
| 116 |
-
else:
|
| 117 |
-
rows.append(current_row)
|
| 118 |
-
current_row = [cell]
|
| 119 |
-
row_top = cell.bbox.t
|
| 120 |
-
row_bottom = cell.bbox.b
|
| 121 |
-
row_height = cell.bbox.height
|
| 122 |
-
|
| 123 |
-
if current_row:
|
| 124 |
-
rows.append(current_row)
|
| 125 |
-
|
| 126 |
-
return rows
|
| 127 |
-
|
| 128 |
-
def merge_row(row: List[Cell]) -> List[Cell]:
|
| 129 |
-
merged = []
|
| 130 |
-
current_group = [row[0]]
|
| 131 |
-
|
| 132 |
-
for cell in row[1:]:
|
| 133 |
-
prev_cell = current_group[-1]
|
| 134 |
-
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
| 135 |
-
if (
|
| 136 |
-
cell.bbox.l - prev_cell.bbox.r
|
| 137 |
-
<= avg_height * horizontal_threshold_factor
|
| 138 |
-
):
|
| 139 |
-
current_group.append(cell)
|
| 140 |
-
else:
|
| 141 |
-
merged.append(merge_group(current_group))
|
| 142 |
-
current_group = [cell]
|
| 143 |
-
|
| 144 |
-
if current_group:
|
| 145 |
-
merged.append(merge_group(current_group))
|
| 146 |
-
|
| 147 |
-
return merged
|
| 148 |
-
|
| 149 |
-
def merge_group(group: List[Cell]) -> Cell:
|
| 150 |
-
if len(group) == 1:
|
| 151 |
-
return group[0]
|
| 152 |
-
|
| 153 |
-
merged_text = "".join(cell.text for cell in group)
|
| 154 |
-
merged_bbox = BoundingBox(
|
| 155 |
-
l=min(cell.bbox.l for cell in group),
|
| 156 |
-
t=min(cell.bbox.t for cell in group),
|
| 157 |
-
r=max(cell.bbox.r for cell in group),
|
| 158 |
-
b=max(cell.bbox.b for cell in group),
|
| 159 |
-
)
|
| 160 |
-
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
| 161 |
-
|
| 162 |
-
rows = group_rows(cells)
|
| 163 |
-
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
| 164 |
-
|
| 165 |
-
for i, cell in enumerate(merged_cells, 1):
|
| 166 |
-
cell.id = i
|
| 167 |
-
|
| 168 |
-
return merged_cells
|
| 169 |
-
|
| 170 |
-
def draw_clusters_and_cells():
|
| 171 |
-
image = (
|
| 172 |
-
self.get_page_image()
|
| 173 |
-
) # make new image to avoid drawing on the saved ones
|
| 174 |
-
draw = ImageDraw.Draw(image)
|
| 175 |
-
for c in cells:
|
| 176 |
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
| 177 |
-
cell_color = (
|
| 178 |
-
random.randint(30, 140),
|
| 179 |
-
random.randint(30, 140),
|
| 180 |
-
random.randint(30, 140),
|
| 181 |
-
)
|
| 182 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
| 183 |
-
image.show()
|
| 184 |
-
|
| 185 |
-
# before merge:
|
| 186 |
-
# draw_clusters_and_cells()
|
| 187 |
-
|
| 188 |
-
cells = merge_horizontal_cells(cells)
|
| 189 |
-
|
| 190 |
-
# after merge:
|
| 191 |
-
# draw_clusters_and_cells()
|
| 192 |
-
|
| 193 |
-
return cells
|
| 194 |
-
|
| 195 |
-
def get_page_image(
|
| 196 |
-
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
| 197 |
-
) -> Image.Image:
|
| 198 |
-
|
| 199 |
-
page_size = self.get_size()
|
| 200 |
-
|
| 201 |
-
if not cropbox:
|
| 202 |
-
cropbox = BoundingBox(
|
| 203 |
-
l=0,
|
| 204 |
-
r=page_size.width,
|
| 205 |
-
t=0,
|
| 206 |
-
b=page_size.height,
|
| 207 |
-
coord_origin=CoordOrigin.TOPLEFT,
|
| 208 |
-
)
|
| 209 |
-
padbox = BoundingBox(
|
| 210 |
-
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
| 211 |
-
)
|
| 212 |
-
else:
|
| 213 |
-
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
| 214 |
-
padbox.r = page_size.width - padbox.r
|
| 215 |
-
padbox.t = page_size.height - padbox.t
|
| 216 |
-
|
| 217 |
-
image = (
|
| 218 |
-
self._ppage.render(
|
| 219 |
-
scale=scale * 1.5,
|
| 220 |
-
rotation=0, # no additional rotation
|
| 221 |
-
crop=padbox.as_tuple(),
|
| 222 |
-
)
|
| 223 |
-
.to_pil()
|
| 224 |
-
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
| 225 |
-
) # We resize the image from 1.5x the given scale to make it sharper.
|
| 226 |
-
|
| 227 |
-
return image
|
| 228 |
-
|
| 229 |
-
def get_size(self) -> Size:
|
| 230 |
-
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
| 231 |
-
|
| 232 |
-
def unload(self):
|
| 233 |
-
self._ppage = None
|
| 234 |
-
self.text_page = None
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
| 238 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 239 |
-
super().__init__(in_doc, path_or_stream)
|
| 240 |
-
|
| 241 |
-
try:
|
| 242 |
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
| 243 |
-
except PdfiumError as e:
|
| 244 |
-
raise RuntimeError(
|
| 245 |
-
f"pypdfium could not load document with hash {self.document_hash}"
|
| 246 |
-
) from e
|
| 247 |
-
|
| 248 |
-
def page_count(self) -> int:
|
| 249 |
-
return len(self._pdoc)
|
| 250 |
-
|
| 251 |
-
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
| 252 |
-
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
| 253 |
-
|
| 254 |
-
def is_valid(self) -> bool:
|
| 255 |
-
return self.page_count() > 0
|
| 256 |
-
|
| 257 |
-
def unload(self):
|
| 258 |
-
super().unload()
|
| 259 |
-
self._pdoc.close()
|
| 260 |
-
self._pdoc = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py
DELETED
|
@@ -1,592 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
from io import BytesIO
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Any, Set, Union
|
| 5 |
-
|
| 6 |
-
import lxml
|
| 7 |
-
from bs4 import BeautifulSoup
|
| 8 |
-
from docling_core.types.doc import (
|
| 9 |
-
DocItemLabel,
|
| 10 |
-
DoclingDocument,
|
| 11 |
-
DocumentOrigin,
|
| 12 |
-
GroupLabel,
|
| 13 |
-
TableCell,
|
| 14 |
-
TableData,
|
| 15 |
-
)
|
| 16 |
-
from lxml import etree
|
| 17 |
-
from typing_extensions import TypedDict, override
|
| 18 |
-
|
| 19 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 20 |
-
from docling.datamodel.base_models import InputFormat
|
| 21 |
-
from docling.datamodel.document import InputDocument
|
| 22 |
-
|
| 23 |
-
_log = logging.getLogger(__name__)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
class Paragraph(TypedDict):
|
| 27 |
-
text: str
|
| 28 |
-
headers: list[str]
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class Author(TypedDict):
|
| 32 |
-
name: str
|
| 33 |
-
affiliation_names: list[str]
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
class Table(TypedDict):
|
| 37 |
-
label: str
|
| 38 |
-
caption: str
|
| 39 |
-
content: str
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class FigureCaption(TypedDict):
|
| 43 |
-
label: str
|
| 44 |
-
caption: str
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
class Reference(TypedDict):
|
| 48 |
-
author_names: str
|
| 49 |
-
title: str
|
| 50 |
-
journal: str
|
| 51 |
-
year: str
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class XMLComponents(TypedDict):
|
| 55 |
-
title: str
|
| 56 |
-
authors: list[Author]
|
| 57 |
-
abstract: str
|
| 58 |
-
paragraphs: list[Paragraph]
|
| 59 |
-
tables: list[Table]
|
| 60 |
-
figure_captions: list[FigureCaption]
|
| 61 |
-
references: list[Reference]
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
| 65 |
-
"""
|
| 66 |
-
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
|
| 67 |
-
Achakulvisut et al., (2020).
|
| 68 |
-
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
|
| 69 |
-
Journal of Open Source Software, 5(46), 1979,
|
| 70 |
-
https://doi.org/10.21105/joss.01979
|
| 71 |
-
"""
|
| 72 |
-
|
| 73 |
-
@override
|
| 74 |
-
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
| 75 |
-
super().__init__(in_doc, path_or_stream)
|
| 76 |
-
self.path_or_stream = path_or_stream
|
| 77 |
-
|
| 78 |
-
# Initialize parents for the document hierarchy
|
| 79 |
-
self.parents: dict = {}
|
| 80 |
-
|
| 81 |
-
self.valid = False
|
| 82 |
-
try:
|
| 83 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 84 |
-
self.path_or_stream.seek(0)
|
| 85 |
-
self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
|
| 86 |
-
if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
|
| 87 |
-
self.valid = True
|
| 88 |
-
except Exception as exc:
|
| 89 |
-
raise RuntimeError(
|
| 90 |
-
f"Could not initialize PubMed backend for file with hash {self.document_hash}."
|
| 91 |
-
) from exc
|
| 92 |
-
|
| 93 |
-
@override
|
| 94 |
-
def is_valid(self) -> bool:
|
| 95 |
-
return self.valid
|
| 96 |
-
|
| 97 |
-
@classmethod
|
| 98 |
-
@override
|
| 99 |
-
def supports_pagination(cls) -> bool:
|
| 100 |
-
return False
|
| 101 |
-
|
| 102 |
-
@override
|
| 103 |
-
def unload(self):
|
| 104 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 105 |
-
self.path_or_stream.close()
|
| 106 |
-
self.path_or_stream = None
|
| 107 |
-
|
| 108 |
-
@classmethod
|
| 109 |
-
@override
|
| 110 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 111 |
-
return {InputFormat.XML_PUBMED}
|
| 112 |
-
|
| 113 |
-
@override
|
| 114 |
-
def convert(self) -> DoclingDocument:
|
| 115 |
-
# Create empty document
|
| 116 |
-
origin = DocumentOrigin(
|
| 117 |
-
filename=self.file.name or "file",
|
| 118 |
-
mimetype="application/xml",
|
| 119 |
-
binary_hash=self.document_hash,
|
| 120 |
-
)
|
| 121 |
-
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
| 122 |
-
|
| 123 |
-
_log.debug("Trying to convert PubMed XML document...")
|
| 124 |
-
|
| 125 |
-
# Get parsed XML components
|
| 126 |
-
xml_components: XMLComponents = self._parse()
|
| 127 |
-
|
| 128 |
-
# Add XML components to the document
|
| 129 |
-
doc = self._populate_document(doc, xml_components)
|
| 130 |
-
return doc
|
| 131 |
-
|
| 132 |
-
def _parse_title(self) -> str:
|
| 133 |
-
title: str = " ".join(
|
| 134 |
-
[
|
| 135 |
-
t.replace("\n", "")
|
| 136 |
-
for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
|
| 137 |
-
]
|
| 138 |
-
)
|
| 139 |
-
return title
|
| 140 |
-
|
| 141 |
-
def _parse_authors(self) -> list[Author]:
|
| 142 |
-
# Get mapping between affiliation ids and names
|
| 143 |
-
affiliation_names = []
|
| 144 |
-
for affiliation_node in self.tree.xpath(".//aff[@id]"):
|
| 145 |
-
affiliation_names.append(
|
| 146 |
-
": ".join([t for t in affiliation_node.itertext() if t != "\n"])
|
| 147 |
-
)
|
| 148 |
-
affiliation_ids_names = {
|
| 149 |
-
id: name
|
| 150 |
-
for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
|
| 151 |
-
}
|
| 152 |
-
|
| 153 |
-
# Get author names and affiliation names
|
| 154 |
-
authors: list[Author] = []
|
| 155 |
-
for author_node in self.tree.xpath(
|
| 156 |
-
'.//contrib-group/contrib[@contrib-type="author"]'
|
| 157 |
-
):
|
| 158 |
-
author: Author = {
|
| 159 |
-
"name": "",
|
| 160 |
-
"affiliation_names": [],
|
| 161 |
-
}
|
| 162 |
-
|
| 163 |
-
# Affiliation names
|
| 164 |
-
affiliation_ids = [
|
| 165 |
-
a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
|
| 166 |
-
]
|
| 167 |
-
for id in affiliation_ids:
|
| 168 |
-
if id in affiliation_ids_names:
|
| 169 |
-
author["affiliation_names"].append(affiliation_ids_names[id])
|
| 170 |
-
|
| 171 |
-
# Name
|
| 172 |
-
author["name"] = (
|
| 173 |
-
author_node.xpath("name/surname")[0].text
|
| 174 |
-
+ " "
|
| 175 |
-
+ author_node.xpath("name/given-names")[0].text
|
| 176 |
-
)
|
| 177 |
-
|
| 178 |
-
authors.append(author)
|
| 179 |
-
return authors
|
| 180 |
-
|
| 181 |
-
def _parse_abstract(self) -> str:
|
| 182 |
-
texts = []
|
| 183 |
-
for abstract_node in self.tree.xpath(".//abstract"):
|
| 184 |
-
for text in abstract_node.itertext():
|
| 185 |
-
texts.append(text.replace("\n", ""))
|
| 186 |
-
abstract: str = "".join(texts)
|
| 187 |
-
return abstract
|
| 188 |
-
|
| 189 |
-
def _parse_main_text(self) -> list[Paragraph]:
|
| 190 |
-
paragraphs: list[Paragraph] = []
|
| 191 |
-
for paragraph_node in self.tree.xpath("//body//p"):
|
| 192 |
-
# Skip captions
|
| 193 |
-
if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
|
| 194 |
-
continue
|
| 195 |
-
|
| 196 |
-
paragraph: Paragraph = {"text": "", "headers": []}
|
| 197 |
-
|
| 198 |
-
# Text
|
| 199 |
-
paragraph["text"] = "".join(
|
| 200 |
-
[t.replace("\n", "") for t in paragraph_node.itertext()]
|
| 201 |
-
)
|
| 202 |
-
|
| 203 |
-
# Header
|
| 204 |
-
path = "../title"
|
| 205 |
-
while len(paragraph_node.xpath(path)) > 0:
|
| 206 |
-
paragraph["headers"].append(
|
| 207 |
-
"".join(
|
| 208 |
-
[
|
| 209 |
-
t.replace("\n", "")
|
| 210 |
-
for t in paragraph_node.xpath(path)[0].itertext()
|
| 211 |
-
]
|
| 212 |
-
)
|
| 213 |
-
)
|
| 214 |
-
path = "../" + path
|
| 215 |
-
|
| 216 |
-
paragraphs.append(paragraph)
|
| 217 |
-
|
| 218 |
-
return paragraphs
|
| 219 |
-
|
| 220 |
-
def _parse_tables(self) -> list[Table]:
|
| 221 |
-
tables: list[Table] = []
|
| 222 |
-
for table_node in self.tree.xpath(".//body//table-wrap"):
|
| 223 |
-
table: Table = {"label": "", "caption": "", "content": ""}
|
| 224 |
-
|
| 225 |
-
# Content
|
| 226 |
-
if len(table_node.xpath("table")) > 0:
|
| 227 |
-
table_content_node = table_node.xpath("table")[0]
|
| 228 |
-
elif len(table_node.xpath("alternatives/table")) > 0:
|
| 229 |
-
table_content_node = table_node.xpath("alternatives/table")[0]
|
| 230 |
-
else:
|
| 231 |
-
table_content_node = None
|
| 232 |
-
if table_content_node != None:
|
| 233 |
-
table["content"] = etree.tostring(table_content_node).decode("utf-8")
|
| 234 |
-
|
| 235 |
-
# Caption
|
| 236 |
-
if len(table_node.xpath("caption/p")) > 0:
|
| 237 |
-
caption_node = table_node.xpath("caption/p")[0]
|
| 238 |
-
elif len(table_node.xpath("caption/title")) > 0:
|
| 239 |
-
caption_node = table_node.xpath("caption/title")[0]
|
| 240 |
-
else:
|
| 241 |
-
caption_node = None
|
| 242 |
-
if caption_node != None:
|
| 243 |
-
table["caption"] = "".join(
|
| 244 |
-
[t.replace("\n", "") for t in caption_node.itertext()]
|
| 245 |
-
)
|
| 246 |
-
|
| 247 |
-
# Label
|
| 248 |
-
if len(table_node.xpath("label")) > 0:
|
| 249 |
-
table["label"] = table_node.xpath("label")[0].text
|
| 250 |
-
|
| 251 |
-
tables.append(table)
|
| 252 |
-
return tables
|
| 253 |
-
|
| 254 |
-
def _parse_figure_captions(self) -> list[FigureCaption]:
|
| 255 |
-
figure_captions: list[FigureCaption] = []
|
| 256 |
-
|
| 257 |
-
if not (self.tree.xpath(".//fig")):
|
| 258 |
-
return figure_captions
|
| 259 |
-
|
| 260 |
-
for figure_node in self.tree.xpath(".//fig"):
|
| 261 |
-
figure_caption: FigureCaption = {
|
| 262 |
-
"caption": "",
|
| 263 |
-
"label": "",
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
# Label
|
| 267 |
-
if figure_node.xpath("label"):
|
| 268 |
-
figure_caption["label"] = "".join(
|
| 269 |
-
[
|
| 270 |
-
t.replace("\n", "")
|
| 271 |
-
for t in figure_node.xpath("label")[0].itertext()
|
| 272 |
-
]
|
| 273 |
-
)
|
| 274 |
-
|
| 275 |
-
# Caption
|
| 276 |
-
if figure_node.xpath("caption"):
|
| 277 |
-
caption = ""
|
| 278 |
-
for caption_node in figure_node.xpath("caption")[0].getchildren():
|
| 279 |
-
caption += (
|
| 280 |
-
"".join([t.replace("\n", "") for t in caption_node.itertext()])
|
| 281 |
-
+ "\n"
|
| 282 |
-
)
|
| 283 |
-
figure_caption["caption"] = caption
|
| 284 |
-
|
| 285 |
-
figure_captions.append(figure_caption)
|
| 286 |
-
|
| 287 |
-
return figure_captions
|
| 288 |
-
|
| 289 |
-
def _parse_references(self) -> list[Reference]:
|
| 290 |
-
references: list[Reference] = []
|
| 291 |
-
for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
|
| 292 |
-
reference: Reference = {
|
| 293 |
-
"author_names": "",
|
| 294 |
-
"title": "",
|
| 295 |
-
"journal": "",
|
| 296 |
-
"year": "",
|
| 297 |
-
}
|
| 298 |
-
reference_node: Any = None
|
| 299 |
-
for tag in ["mixed-citation", "element-citation", "citation"]:
|
| 300 |
-
if len(reference_node_abs.xpath(tag)) > 0:
|
| 301 |
-
reference_node = reference_node_abs.xpath(tag)[0]
|
| 302 |
-
break
|
| 303 |
-
|
| 304 |
-
if reference_node is None:
|
| 305 |
-
continue
|
| 306 |
-
|
| 307 |
-
if all(
|
| 308 |
-
not (ref_type in ["citation-type", "publication-type"])
|
| 309 |
-
for ref_type in reference_node.attrib.keys()
|
| 310 |
-
):
|
| 311 |
-
continue
|
| 312 |
-
|
| 313 |
-
# Author names
|
| 314 |
-
names = []
|
| 315 |
-
if len(reference_node.xpath("name")) > 0:
|
| 316 |
-
for name_node in reference_node.xpath("name"):
|
| 317 |
-
name_str = " ".join(
|
| 318 |
-
[t.text for t in name_node.getchildren() if (t.text != None)]
|
| 319 |
-
)
|
| 320 |
-
names.append(name_str)
|
| 321 |
-
elif len(reference_node.xpath("person-group")) > 0:
|
| 322 |
-
for name_node in reference_node.xpath("person-group")[0]:
|
| 323 |
-
name_str = (
|
| 324 |
-
name_node.xpath("given-names")[0].text
|
| 325 |
-
+ " "
|
| 326 |
-
+ name_node.xpath("surname")[0].text
|
| 327 |
-
)
|
| 328 |
-
names.append(name_str)
|
| 329 |
-
reference["author_names"] = "; ".join(names)
|
| 330 |
-
|
| 331 |
-
# Title
|
| 332 |
-
if len(reference_node.xpath("article-title")) > 0:
|
| 333 |
-
reference["title"] = " ".join(
|
| 334 |
-
[
|
| 335 |
-
t.replace("\n", " ")
|
| 336 |
-
for t in reference_node.xpath("article-title")[0].itertext()
|
| 337 |
-
]
|
| 338 |
-
)
|
| 339 |
-
|
| 340 |
-
# Journal
|
| 341 |
-
if len(reference_node.xpath("source")) > 0:
|
| 342 |
-
reference["journal"] = reference_node.xpath("source")[0].text
|
| 343 |
-
|
| 344 |
-
# Year
|
| 345 |
-
if len(reference_node.xpath("year")) > 0:
|
| 346 |
-
reference["year"] = reference_node.xpath("year")[0].text
|
| 347 |
-
|
| 348 |
-
if (
|
| 349 |
-
not (reference_node.xpath("article-title"))
|
| 350 |
-
and not (reference_node.xpath("journal"))
|
| 351 |
-
and not (reference_node.xpath("year"))
|
| 352 |
-
):
|
| 353 |
-
reference["title"] = reference_node.text
|
| 354 |
-
|
| 355 |
-
references.append(reference)
|
| 356 |
-
return references
|
| 357 |
-
|
| 358 |
-
def _parse(self) -> XMLComponents:
|
| 359 |
-
"""Parsing PubMed document."""
|
| 360 |
-
xml_components: XMLComponents = {
|
| 361 |
-
"title": self._parse_title(),
|
| 362 |
-
"authors": self._parse_authors(),
|
| 363 |
-
"abstract": self._parse_abstract(),
|
| 364 |
-
"paragraphs": self._parse_main_text(),
|
| 365 |
-
"tables": self._parse_tables(),
|
| 366 |
-
"figure_captions": self._parse_figure_captions(),
|
| 367 |
-
"references": self._parse_references(),
|
| 368 |
-
}
|
| 369 |
-
return xml_components
|
| 370 |
-
|
| 371 |
-
def _populate_document(
|
| 372 |
-
self, doc: DoclingDocument, xml_components: XMLComponents
|
| 373 |
-
) -> DoclingDocument:
|
| 374 |
-
self._add_title(doc, xml_components)
|
| 375 |
-
self._add_authors(doc, xml_components)
|
| 376 |
-
self._add_abstract(doc, xml_components)
|
| 377 |
-
self._add_main_text(doc, xml_components)
|
| 378 |
-
|
| 379 |
-
if xml_components["tables"]:
|
| 380 |
-
self._add_tables(doc, xml_components)
|
| 381 |
-
|
| 382 |
-
if xml_components["figure_captions"]:
|
| 383 |
-
self._add_figure_captions(doc, xml_components)
|
| 384 |
-
|
| 385 |
-
self._add_references(doc, xml_components)
|
| 386 |
-
return doc
|
| 387 |
-
|
| 388 |
-
def _add_figure_captions(
|
| 389 |
-
self, doc: DoclingDocument, xml_components: XMLComponents
|
| 390 |
-
) -> None:
|
| 391 |
-
self.parents["Figures"] = doc.add_heading(
|
| 392 |
-
parent=self.parents["Title"], text="Figures"
|
| 393 |
-
)
|
| 394 |
-
for figure_caption_xml_component in xml_components["figure_captions"]:
|
| 395 |
-
figure_caption_text = (
|
| 396 |
-
figure_caption_xml_component["label"]
|
| 397 |
-
+ ": "
|
| 398 |
-
+ figure_caption_xml_component["caption"].strip()
|
| 399 |
-
)
|
| 400 |
-
fig_caption = doc.add_text(
|
| 401 |
-
label=DocItemLabel.CAPTION, text=figure_caption_text
|
| 402 |
-
)
|
| 403 |
-
doc.add_picture(
|
| 404 |
-
parent=self.parents["Figures"],
|
| 405 |
-
caption=fig_caption,
|
| 406 |
-
)
|
| 407 |
-
return
|
| 408 |
-
|
| 409 |
-
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
|
| 410 |
-
self.parents["Title"] = doc.add_text(
|
| 411 |
-
parent=None,
|
| 412 |
-
text=xml_components["title"],
|
| 413 |
-
label=DocItemLabel.TITLE,
|
| 414 |
-
)
|
| 415 |
-
return
|
| 416 |
-
|
| 417 |
-
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
|
| 418 |
-
authors_affiliations: list = []
|
| 419 |
-
for author in xml_components["authors"]:
|
| 420 |
-
authors_affiliations.append(author["name"])
|
| 421 |
-
authors_affiliations.append(", ".join(author["affiliation_names"]))
|
| 422 |
-
authors_affiliations_str = "; ".join(authors_affiliations)
|
| 423 |
-
|
| 424 |
-
doc.add_text(
|
| 425 |
-
parent=self.parents["Title"],
|
| 426 |
-
text=authors_affiliations_str,
|
| 427 |
-
label=DocItemLabel.PARAGRAPH,
|
| 428 |
-
)
|
| 429 |
-
return
|
| 430 |
-
|
| 431 |
-
def _add_abstract(
|
| 432 |
-
self, doc: DoclingDocument, xml_components: XMLComponents
|
| 433 |
-
) -> None:
|
| 434 |
-
abstract_text: str = xml_components["abstract"]
|
| 435 |
-
self.parents["Abstract"] = doc.add_heading(
|
| 436 |
-
parent=self.parents["Title"], text="Abstract"
|
| 437 |
-
)
|
| 438 |
-
doc.add_text(
|
| 439 |
-
parent=self.parents["Abstract"],
|
| 440 |
-
text=abstract_text,
|
| 441 |
-
label=DocItemLabel.TEXT,
|
| 442 |
-
)
|
| 443 |
-
return
|
| 444 |
-
|
| 445 |
-
def _add_main_text(
|
| 446 |
-
self, doc: DoclingDocument, xml_components: XMLComponents
|
| 447 |
-
) -> None:
|
| 448 |
-
added_headers: list = []
|
| 449 |
-
for paragraph in xml_components["paragraphs"]:
|
| 450 |
-
if not (paragraph["headers"]):
|
| 451 |
-
continue
|
| 452 |
-
|
| 453 |
-
# Header
|
| 454 |
-
for i, header in enumerate(reversed(paragraph["headers"])):
|
| 455 |
-
if header in added_headers:
|
| 456 |
-
continue
|
| 457 |
-
added_headers.append(header)
|
| 458 |
-
|
| 459 |
-
if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
|
| 460 |
-
i - 1
|
| 461 |
-
] in self.parents:
|
| 462 |
-
parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
|
| 463 |
-
else:
|
| 464 |
-
parent = self.parents["Title"]
|
| 465 |
-
|
| 466 |
-
self.parents[header] = doc.add_heading(parent=parent, text=header)
|
| 467 |
-
|
| 468 |
-
# Paragraph text
|
| 469 |
-
if paragraph["headers"][0] in self.parents:
|
| 470 |
-
parent = self.parents[paragraph["headers"][0]]
|
| 471 |
-
else:
|
| 472 |
-
parent = self.parents["Title"]
|
| 473 |
-
|
| 474 |
-
doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
|
| 475 |
-
return
|
| 476 |
-
|
| 477 |
-
def _add_references(
|
| 478 |
-
self, doc: DoclingDocument, xml_components: XMLComponents
|
| 479 |
-
) -> None:
|
| 480 |
-
self.parents["References"] = doc.add_heading(
|
| 481 |
-
parent=self.parents["Title"], text="References"
|
| 482 |
-
)
|
| 483 |
-
current_list = doc.add_group(
|
| 484 |
-
parent=self.parents["References"], label=GroupLabel.LIST, name="list"
|
| 485 |
-
)
|
| 486 |
-
for reference in xml_components["references"]:
|
| 487 |
-
reference_text: str = ""
|
| 488 |
-
if reference["author_names"]:
|
| 489 |
-
reference_text += reference["author_names"] + ". "
|
| 490 |
-
|
| 491 |
-
if reference["title"]:
|
| 492 |
-
reference_text += reference["title"]
|
| 493 |
-
if reference["title"][-1] != ".":
|
| 494 |
-
reference_text += "."
|
| 495 |
-
reference_text += " "
|
| 496 |
-
|
| 497 |
-
if reference["journal"]:
|
| 498 |
-
reference_text += reference["journal"]
|
| 499 |
-
|
| 500 |
-
if reference["year"]:
|
| 501 |
-
reference_text += " (" + reference["year"] + ")"
|
| 502 |
-
|
| 503 |
-
if not (reference_text):
|
| 504 |
-
_log.debug(f"Skipping reference for: {str(self.file)}")
|
| 505 |
-
continue
|
| 506 |
-
|
| 507 |
-
doc.add_list_item(
|
| 508 |
-
text=reference_text, enumerated=False, parent=current_list
|
| 509 |
-
)
|
| 510 |
-
return
|
| 511 |
-
|
| 512 |
-
def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
|
| 513 |
-
self.parents["Tables"] = doc.add_heading(
|
| 514 |
-
parent=self.parents["Title"], text="Tables"
|
| 515 |
-
)
|
| 516 |
-
for table_xml_component in xml_components["tables"]:
|
| 517 |
-
try:
|
| 518 |
-
self._add_table(doc, table_xml_component)
|
| 519 |
-
except Exception as e:
|
| 520 |
-
_log.debug(f"Skipping unsupported table for: {str(self.file)}")
|
| 521 |
-
pass
|
| 522 |
-
return
|
| 523 |
-
|
| 524 |
-
def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
|
| 525 |
-
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
|
| 526 |
-
table_tag = soup.find("table")
|
| 527 |
-
|
| 528 |
-
nested_tables = table_tag.find("table")
|
| 529 |
-
if nested_tables:
|
| 530 |
-
_log.debug(f"Skipping nested table for: {str(self.file)}")
|
| 531 |
-
return
|
| 532 |
-
|
| 533 |
-
# Count the number of rows (number of <tr> elements)
|
| 534 |
-
num_rows = len(table_tag.find_all("tr"))
|
| 535 |
-
|
| 536 |
-
# Find the number of columns (taking into account colspan)
|
| 537 |
-
num_cols = 0
|
| 538 |
-
for row in table_tag.find_all("tr"):
|
| 539 |
-
col_count = 0
|
| 540 |
-
for cell in row.find_all(["td", "th"]):
|
| 541 |
-
colspan = int(cell.get("colspan", 1))
|
| 542 |
-
col_count += colspan
|
| 543 |
-
num_cols = max(num_cols, col_count)
|
| 544 |
-
|
| 545 |
-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
| 546 |
-
|
| 547 |
-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
| 548 |
-
|
| 549 |
-
# Iterate over the rows in the table
|
| 550 |
-
for row_idx, row in enumerate(table_tag.find_all("tr")):
|
| 551 |
-
# For each row, find all the column cells (both <td> and <th>)
|
| 552 |
-
cells = row.find_all(["td", "th"])
|
| 553 |
-
|
| 554 |
-
# Check if each cell in the row is a header -> means it is a column header
|
| 555 |
-
col_header = True
|
| 556 |
-
for j, html_cell in enumerate(cells):
|
| 557 |
-
if html_cell.name == "td":
|
| 558 |
-
col_header = False
|
| 559 |
-
|
| 560 |
-
# Extract and print the text content of each cell
|
| 561 |
-
col_idx = 0
|
| 562 |
-
for _, html_cell in enumerate(cells):
|
| 563 |
-
text = html_cell.text
|
| 564 |
-
|
| 565 |
-
col_span = int(html_cell.get("colspan", 1))
|
| 566 |
-
row_span = int(html_cell.get("rowspan", 1))
|
| 567 |
-
|
| 568 |
-
while grid[row_idx][col_idx] != None:
|
| 569 |
-
col_idx += 1
|
| 570 |
-
for r in range(row_span):
|
| 571 |
-
for c in range(col_span):
|
| 572 |
-
grid[row_idx + r][col_idx + c] = text
|
| 573 |
-
|
| 574 |
-
cell = TableCell(
|
| 575 |
-
text=text,
|
| 576 |
-
row_span=row_span,
|
| 577 |
-
col_span=col_span,
|
| 578 |
-
start_row_offset_idx=row_idx,
|
| 579 |
-
end_row_offset_idx=row_idx + row_span,
|
| 580 |
-
start_col_offset_idx=col_idx,
|
| 581 |
-
end_col_offset_idx=col_idx + col_span,
|
| 582 |
-
col_header=col_header,
|
| 583 |
-
row_header=((not col_header) and html_cell.name == "th"),
|
| 584 |
-
)
|
| 585 |
-
data.table_cells.append(cell)
|
| 586 |
-
|
| 587 |
-
table_caption = doc.add_text(
|
| 588 |
-
label=DocItemLabel.CAPTION,
|
| 589 |
-
text=table_xml_component["label"] + ": " + table_xml_component["caption"],
|
| 590 |
-
)
|
| 591 |
-
doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
|
| 592 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py
DELETED
|
@@ -1,1888 +0,0 @@
|
|
| 1 |
-
"""Backend to parse patents from the United States Patent Office (USPTO).
|
| 2 |
-
|
| 3 |
-
The parsers included in this module can handle patent grants pubished since 1976 and
|
| 4 |
-
patent applications since 2001.
|
| 5 |
-
The original files can be found in https://bulkdata.uspto.gov.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import html
|
| 9 |
-
import logging
|
| 10 |
-
import re
|
| 11 |
-
import xml.sax
|
| 12 |
-
import xml.sax.xmlreader
|
| 13 |
-
from abc import ABC, abstractmethod
|
| 14 |
-
from enum import Enum, unique
|
| 15 |
-
from io import BytesIO
|
| 16 |
-
from pathlib import Path
|
| 17 |
-
from typing import Any, Final, Optional, Union
|
| 18 |
-
|
| 19 |
-
from bs4 import BeautifulSoup, Tag
|
| 20 |
-
from docling_core.types.doc import (
|
| 21 |
-
DocItem,
|
| 22 |
-
DocItemLabel,
|
| 23 |
-
DoclingDocument,
|
| 24 |
-
DocumentOrigin,
|
| 25 |
-
TableCell,
|
| 26 |
-
TableData,
|
| 27 |
-
TextItem,
|
| 28 |
-
)
|
| 29 |
-
from docling_core.types.doc.document import LevelNumber
|
| 30 |
-
from pydantic import NonNegativeInt
|
| 31 |
-
from typing_extensions import Self, TypedDict, override
|
| 32 |
-
|
| 33 |
-
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
| 34 |
-
from docling.datamodel.base_models import InputFormat
|
| 35 |
-
from docling.datamodel.document import InputDocument
|
| 36 |
-
|
| 37 |
-
_log = logging.getLogger(__name__)
|
| 38 |
-
|
| 39 |
-
XML_DECLARATION: Final = '<?xml version="1.0" encoding="UTF-8"?>'
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
@unique
|
| 43 |
-
class PatentHeading(Enum):
|
| 44 |
-
"""Text of docling headings for tagged sections in USPTO patent documents."""
|
| 45 |
-
|
| 46 |
-
ABSTRACT = "ABSTRACT", 2
|
| 47 |
-
CLAIMS = "CLAIMS", 2
|
| 48 |
-
|
| 49 |
-
@override
|
| 50 |
-
def __new__(cls, value: str, _) -> Self:
|
| 51 |
-
obj = object.__new__(cls)
|
| 52 |
-
obj._value_ = value
|
| 53 |
-
return obj
|
| 54 |
-
|
| 55 |
-
@override
|
| 56 |
-
def __init__(self, _, level: LevelNumber) -> None:
|
| 57 |
-
self.level: LevelNumber = level
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
| 61 |
-
@override
|
| 62 |
-
def __init__(
|
| 63 |
-
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
|
| 64 |
-
) -> None:
|
| 65 |
-
super().__init__(in_doc, path_or_stream)
|
| 66 |
-
|
| 67 |
-
self.patent_content: str = ""
|
| 68 |
-
self.parser: Optional[PatentUspto] = None
|
| 69 |
-
|
| 70 |
-
try:
|
| 71 |
-
if isinstance(self.path_or_stream, BytesIO):
|
| 72 |
-
while line := self.path_or_stream.readline().decode("utf-8"):
|
| 73 |
-
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
| 74 |
-
self._set_parser(line)
|
| 75 |
-
self.patent_content += line
|
| 76 |
-
elif isinstance(self.path_or_stream, Path):
|
| 77 |
-
with open(self.path_or_stream, encoding="utf-8") as file_obj:
|
| 78 |
-
while line := file_obj.readline():
|
| 79 |
-
if line.startswith("<!DOCTYPE") or line == "PATN\n":
|
| 80 |
-
self._set_parser(line)
|
| 81 |
-
self.patent_content += line
|
| 82 |
-
except Exception as exc:
|
| 83 |
-
raise RuntimeError(
|
| 84 |
-
f"Could not initialize USPTO backend for file with hash {self.document_hash}."
|
| 85 |
-
) from exc
|
| 86 |
-
|
| 87 |
-
def _set_parser(self, doctype: str) -> None:
|
| 88 |
-
doctype_line = doctype.lower()
|
| 89 |
-
if doctype == "PATN\n":
|
| 90 |
-
self.parser = PatentUsptoGrantAps()
|
| 91 |
-
elif "us-patent-application-v4" in doctype_line:
|
| 92 |
-
self.parser = PatentUsptoIce()
|
| 93 |
-
elif "us-patent-grant-v4" in doctype_line:
|
| 94 |
-
self.parser = PatentUsptoIce()
|
| 95 |
-
elif "us-grant-025" in doctype_line:
|
| 96 |
-
self.parser = PatentUsptoGrantV2()
|
| 97 |
-
elif all(
|
| 98 |
-
item in doctype_line
|
| 99 |
-
for item in ("patent-application-publication", "pap-v1")
|
| 100 |
-
):
|
| 101 |
-
self.parser = PatentUsptoAppV1()
|
| 102 |
-
else:
|
| 103 |
-
self.parser = None
|
| 104 |
-
|
| 105 |
-
@override
|
| 106 |
-
def is_valid(self) -> bool:
|
| 107 |
-
return bool(self.patent_content) and bool(self.parser)
|
| 108 |
-
|
| 109 |
-
@classmethod
|
| 110 |
-
@override
|
| 111 |
-
def supports_pagination(cls) -> bool:
|
| 112 |
-
return False
|
| 113 |
-
|
| 114 |
-
@override
|
| 115 |
-
def unload(self) -> None:
|
| 116 |
-
return
|
| 117 |
-
|
| 118 |
-
@classmethod
|
| 119 |
-
@override
|
| 120 |
-
def supported_formats(cls) -> set[InputFormat]:
|
| 121 |
-
return {InputFormat.XML_USPTO}
|
| 122 |
-
|
| 123 |
-
@override
|
| 124 |
-
def convert(self) -> DoclingDocument:
|
| 125 |
-
|
| 126 |
-
if self.parser is not None:
|
| 127 |
-
doc = self.parser.parse(self.patent_content)
|
| 128 |
-
if doc is None:
|
| 129 |
-
raise RuntimeError(
|
| 130 |
-
f"Failed to convert doc (hash={self.document_hash}, "
|
| 131 |
-
f"name={self.file.name})."
|
| 132 |
-
)
|
| 133 |
-
doc.name = self.file.name or "file"
|
| 134 |
-
mime_type = (
|
| 135 |
-
"text/plain"
|
| 136 |
-
if isinstance(self.parser, PatentUsptoGrantAps)
|
| 137 |
-
else "application/xml"
|
| 138 |
-
)
|
| 139 |
-
doc.origin = DocumentOrigin(
|
| 140 |
-
mimetype=mime_type,
|
| 141 |
-
binary_hash=self.document_hash,
|
| 142 |
-
filename=self.file.name or "file",
|
| 143 |
-
)
|
| 144 |
-
|
| 145 |
-
return doc
|
| 146 |
-
else:
|
| 147 |
-
raise RuntimeError(
|
| 148 |
-
f"Cannot convert doc (hash={self.document_hash}, "
|
| 149 |
-
f"name={self.file.name}) because the backend failed to init."
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
class PatentUspto(ABC):
|
| 154 |
-
"""Parser of patent documents from the US Patent Office."""
|
| 155 |
-
|
| 156 |
-
@abstractmethod
|
| 157 |
-
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
| 158 |
-
"""Parse a USPTO patent.
|
| 159 |
-
|
| 160 |
-
Parameters:
|
| 161 |
-
patent_content: The content of a single patent in a USPTO file.
|
| 162 |
-
|
| 163 |
-
Returns:
|
| 164 |
-
The patent parsed as a docling document.
|
| 165 |
-
"""
|
| 166 |
-
pass
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
class PatentUsptoIce(PatentUspto):
|
| 170 |
-
"""Parser of patent documents from the US Patent Office (ICE).
|
| 171 |
-
|
| 172 |
-
The compatible formats are:
|
| 173 |
-
- Patent Grant Full Text Data/XML Version 4.x ICE (from January 2005)
|
| 174 |
-
- Patent Application Full Text Data/XML Version 4.x ICE (from January 2005)
|
| 175 |
-
"""
|
| 176 |
-
|
| 177 |
-
def __init__(self) -> None:
|
| 178 |
-
"""Build an instance of PatentUsptoIce class."""
|
| 179 |
-
self.handler = PatentUsptoIce.PatentHandler()
|
| 180 |
-
self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
|
| 181 |
-
|
| 182 |
-
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
| 183 |
-
try:
|
| 184 |
-
xml.sax.parseString(patent_content, self.handler)
|
| 185 |
-
except xml.sax._exceptions.SAXParseException as exc_sax:
|
| 186 |
-
_log.error(f"Error in parsing USPTO document: {exc_sax}")
|
| 187 |
-
|
| 188 |
-
return None
|
| 189 |
-
|
| 190 |
-
doc = self.handler.doc
|
| 191 |
-
if doc:
|
| 192 |
-
raw_tables = re.findall(self.pattern, patent_content)
|
| 193 |
-
parsed_tables: list[TableData] = []
|
| 194 |
-
_log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
|
| 195 |
-
for table in raw_tables:
|
| 196 |
-
table_parser = XmlTable(XML_DECLARATION + "\n" + table)
|
| 197 |
-
try:
|
| 198 |
-
table_data = table_parser.parse()
|
| 199 |
-
if table_data:
|
| 200 |
-
parsed_tables.append(table_data)
|
| 201 |
-
except Exception as exc_table:
|
| 202 |
-
_log.error(f"Error in parsing USPTO tables: {exc_table}")
|
| 203 |
-
if len(parsed_tables) != len(doc.tables):
|
| 204 |
-
_log.error(
|
| 205 |
-
f"Number of referenced ({len(doc.tables)}) and parsed "
|
| 206 |
-
f"({len(parsed_tables)}) tables differ."
|
| 207 |
-
)
|
| 208 |
-
else:
|
| 209 |
-
for idx, item in enumerate(parsed_tables):
|
| 210 |
-
doc.tables[idx].data = item
|
| 211 |
-
|
| 212 |
-
return doc
|
| 213 |
-
|
| 214 |
-
class PatentHandler(xml.sax.handler.ContentHandler):
|
| 215 |
-
"""SAX ContentHandler for patent documents."""
|
| 216 |
-
|
| 217 |
-
APP_DOC_ELEMENT: Final = "us-patent-application"
|
| 218 |
-
GRANT_DOC_ELEMENT: Final = "us-patent-grant"
|
| 219 |
-
|
| 220 |
-
@unique
|
| 221 |
-
class Element(Enum):
|
| 222 |
-
"""Represents an element of interest in the patent application document."""
|
| 223 |
-
|
| 224 |
-
ABSTRACT = "abstract", True
|
| 225 |
-
TITLE = "invention-title", True
|
| 226 |
-
CLAIMS = "claims", False
|
| 227 |
-
CLAIM = "claim", False
|
| 228 |
-
CLAIM_TEXT = "claim-text", True
|
| 229 |
-
PARAGRAPH = "p", True
|
| 230 |
-
HEADING = "heading", True
|
| 231 |
-
DESCRIPTION = "description", False
|
| 232 |
-
TABLE = "table", False # to track its position, without text
|
| 233 |
-
DRAWINGS = "description-of-drawings", True
|
| 234 |
-
STYLE_SUPERSCRIPT = "sup", True
|
| 235 |
-
STYLE_SUBSCRIPT = "sub", True
|
| 236 |
-
MATHS = "maths", False # to avoid keeping formulas
|
| 237 |
-
|
| 238 |
-
@override
|
| 239 |
-
def __new__(cls, value: str, _) -> Self:
|
| 240 |
-
obj = object.__new__(cls)
|
| 241 |
-
obj._value_ = value
|
| 242 |
-
return obj
|
| 243 |
-
|
| 244 |
-
@override
|
| 245 |
-
def __init__(self, _, is_text: bool) -> None:
|
| 246 |
-
self.is_text: bool = is_text
|
| 247 |
-
|
| 248 |
-
@override
|
| 249 |
-
def __init__(self) -> None:
|
| 250 |
-
"""Build an instance of the patent handler."""
|
| 251 |
-
# Current patent being parsed
|
| 252 |
-
self.doc: Optional[DoclingDocument] = None
|
| 253 |
-
# Keep track of docling hierarchy level
|
| 254 |
-
self.level: LevelNumber = 1
|
| 255 |
-
# Keep track of docling parents by level
|
| 256 |
-
self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
|
| 257 |
-
# Content to retain for the current patent
|
| 258 |
-
self.property: list[str]
|
| 259 |
-
self.claim: str
|
| 260 |
-
self.claims: list[str]
|
| 261 |
-
self.abstract: str
|
| 262 |
-
self.text: str
|
| 263 |
-
self._clean_data()
|
| 264 |
-
# To handle mathematical styling
|
| 265 |
-
self.style_html = HtmlEntity()
|
| 266 |
-
|
| 267 |
-
@override
|
| 268 |
-
def startElement(self, tag, attributes): # noqa: N802
|
| 269 |
-
"""Signal the start of an element.
|
| 270 |
-
|
| 271 |
-
Args:
|
| 272 |
-
tag: The element tag.
|
| 273 |
-
attributes: The element attributes.
|
| 274 |
-
"""
|
| 275 |
-
if tag in (
|
| 276 |
-
self.APP_DOC_ELEMENT,
|
| 277 |
-
self.GRANT_DOC_ELEMENT,
|
| 278 |
-
):
|
| 279 |
-
self.doc = DoclingDocument(name="file")
|
| 280 |
-
self.text = ""
|
| 281 |
-
self._start_registered_elements(tag, attributes)
|
| 282 |
-
|
| 283 |
-
@override
|
| 284 |
-
def skippedEntity(self, name): # noqa: N802
|
| 285 |
-
"""Receive notification of a skipped entity.
|
| 286 |
-
|
| 287 |
-
HTML entities will be skipped by the parser. This method will unescape them
|
| 288 |
-
and add them to the text.
|
| 289 |
-
|
| 290 |
-
Args:
|
| 291 |
-
name: Entity name.
|
| 292 |
-
"""
|
| 293 |
-
if self.property:
|
| 294 |
-
elm_val = self.property[-1]
|
| 295 |
-
element = self.Element(elm_val)
|
| 296 |
-
if element.is_text:
|
| 297 |
-
escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
|
| 298 |
-
unescaped = html.unescape(escaped)
|
| 299 |
-
if unescaped == escaped:
|
| 300 |
-
_log.debug(f"Unrecognized HTML entity: {name}")
|
| 301 |
-
return
|
| 302 |
-
|
| 303 |
-
if element in (
|
| 304 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 305 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 306 |
-
):
|
| 307 |
-
# superscripts and subscripts need to be under text elements
|
| 308 |
-
if len(self.property) < 2:
|
| 309 |
-
return
|
| 310 |
-
parent_val = self.property[-2]
|
| 311 |
-
parent = self.Element(parent_val)
|
| 312 |
-
if parent.is_text:
|
| 313 |
-
self.text += self._apply_style(unescaped, elm_val)
|
| 314 |
-
else:
|
| 315 |
-
self.text += unescaped
|
| 316 |
-
|
| 317 |
-
@override
|
| 318 |
-
def endElement(self, tag): # noqa: N802
|
| 319 |
-
"""Signal the end of an element.
|
| 320 |
-
|
| 321 |
-
Args:
|
| 322 |
-
tag: The element tag.
|
| 323 |
-
"""
|
| 324 |
-
if tag in (
|
| 325 |
-
self.APP_DOC_ELEMENT,
|
| 326 |
-
self.GRANT_DOC_ELEMENT,
|
| 327 |
-
):
|
| 328 |
-
self._clean_data()
|
| 329 |
-
self._end_registered_element(tag)
|
| 330 |
-
|
| 331 |
-
@override
|
| 332 |
-
def characters(self, content):
|
| 333 |
-
"""Receive notification of character data.
|
| 334 |
-
|
| 335 |
-
Args:
|
| 336 |
-
content: Data reported by the handler.
|
| 337 |
-
"""
|
| 338 |
-
if self.property:
|
| 339 |
-
elm_val = self.property[-1]
|
| 340 |
-
element = self.Element(elm_val)
|
| 341 |
-
if element.is_text:
|
| 342 |
-
if element in (
|
| 343 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 344 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 345 |
-
):
|
| 346 |
-
# superscripts and subscripts need to be under text elements
|
| 347 |
-
if len(self.property) < 2:
|
| 348 |
-
return
|
| 349 |
-
parent_val = self.property[-2]
|
| 350 |
-
parent = self.Element(parent_val)
|
| 351 |
-
if parent.is_text:
|
| 352 |
-
self.text += self._apply_style(content, elm_val)
|
| 353 |
-
else:
|
| 354 |
-
self.text += content
|
| 355 |
-
|
| 356 |
-
def _start_registered_elements(
|
| 357 |
-
self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
|
| 358 |
-
) -> None:
|
| 359 |
-
if tag in [member.value for member in self.Element]:
|
| 360 |
-
# special case for claims: claim lines may start before the
|
| 361 |
-
# previous one is closed
|
| 362 |
-
if (
|
| 363 |
-
tag == self.Element.CLAIM_TEXT.value
|
| 364 |
-
and self.property
|
| 365 |
-
and self.property[-1] == tag
|
| 366 |
-
and self.text.strip()
|
| 367 |
-
):
|
| 368 |
-
self.claim += " " + self.text.strip()
|
| 369 |
-
self.text = ""
|
| 370 |
-
elif tag == self.Element.HEADING.value:
|
| 371 |
-
level_attr: str = attributes.get("level", "")
|
| 372 |
-
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
| 373 |
-
max_level = min(self.parents.keys())
|
| 374 |
-
# increase heading level with 1 for title, if any
|
| 375 |
-
self.level = (
|
| 376 |
-
new_level + 1 if (new_level + 1) in self.parents else max_level
|
| 377 |
-
)
|
| 378 |
-
self.property.append(tag)
|
| 379 |
-
|
| 380 |
-
def _end_registered_element(self, tag: str) -> None:
|
| 381 |
-
if tag in [item.value for item in self.Element] and self.property:
|
| 382 |
-
current_tag = self.property.pop()
|
| 383 |
-
self._add_property(current_tag, self.text.strip())
|
| 384 |
-
|
| 385 |
-
def _add_property(self, name: str, text: str) -> None:
|
| 386 |
-
if not name or not self.doc:
|
| 387 |
-
return
|
| 388 |
-
|
| 389 |
-
if name == self.Element.TITLE.value:
|
| 390 |
-
if text:
|
| 391 |
-
self.parents[self.level + 1] = self.doc.add_title(
|
| 392 |
-
parent=self.parents[self.level],
|
| 393 |
-
text=text,
|
| 394 |
-
)
|
| 395 |
-
self.level += 1
|
| 396 |
-
self.text = ""
|
| 397 |
-
|
| 398 |
-
elif name == self.Element.ABSTRACT.value:
|
| 399 |
-
if self.abstract:
|
| 400 |
-
heading_text = PatentHeading.ABSTRACT.value
|
| 401 |
-
heading_level = (
|
| 402 |
-
PatentHeading.ABSTRACT.level
|
| 403 |
-
if PatentHeading.ABSTRACT.level in self.parents
|
| 404 |
-
else 1
|
| 405 |
-
)
|
| 406 |
-
abstract_item = self.doc.add_heading(
|
| 407 |
-
heading_text,
|
| 408 |
-
level=heading_level,
|
| 409 |
-
parent=self.parents[heading_level],
|
| 410 |
-
)
|
| 411 |
-
self.doc.add_text(
|
| 412 |
-
label=DocItemLabel.PARAGRAPH,
|
| 413 |
-
text=self.abstract,
|
| 414 |
-
parent=abstract_item,
|
| 415 |
-
)
|
| 416 |
-
|
| 417 |
-
elif name == self.Element.CLAIM_TEXT.value:
|
| 418 |
-
text = re.sub("\\s+", " ", text).strip()
|
| 419 |
-
if text:
|
| 420 |
-
self.claim += " " + text
|
| 421 |
-
self.text = ""
|
| 422 |
-
|
| 423 |
-
elif name == self.Element.CLAIM.value and self.claim:
|
| 424 |
-
self.claims.append(self.claim.strip())
|
| 425 |
-
self.claim = ""
|
| 426 |
-
|
| 427 |
-
elif name == self.Element.CLAIMS.value and self.claims:
|
| 428 |
-
heading_text = PatentHeading.CLAIMS.value
|
| 429 |
-
heading_level = (
|
| 430 |
-
PatentHeading.CLAIMS.level
|
| 431 |
-
if PatentHeading.CLAIMS.level in self.parents
|
| 432 |
-
else 1
|
| 433 |
-
)
|
| 434 |
-
claims_item = self.doc.add_heading(
|
| 435 |
-
heading_text,
|
| 436 |
-
level=heading_level,
|
| 437 |
-
parent=self.parents[heading_level],
|
| 438 |
-
)
|
| 439 |
-
for text in self.claims:
|
| 440 |
-
self.doc.add_text(
|
| 441 |
-
label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
|
| 442 |
-
)
|
| 443 |
-
|
| 444 |
-
elif name == self.Element.PARAGRAPH.value and text:
|
| 445 |
-
# remmove blank spaces added in paragraphs
|
| 446 |
-
text = re.sub("\\s+", " ", text)
|
| 447 |
-
if self.Element.ABSTRACT.value in self.property:
|
| 448 |
-
self.abstract = (
|
| 449 |
-
(self.abstract + " " + text) if self.abstract else text
|
| 450 |
-
)
|
| 451 |
-
else:
|
| 452 |
-
self.doc.add_text(
|
| 453 |
-
label=DocItemLabel.PARAGRAPH,
|
| 454 |
-
text=text,
|
| 455 |
-
parent=self.parents[self.level],
|
| 456 |
-
)
|
| 457 |
-
self.text = ""
|
| 458 |
-
|
| 459 |
-
elif name == self.Element.HEADING.value and text:
|
| 460 |
-
self.parents[self.level + 1] = self.doc.add_heading(
|
| 461 |
-
text=text,
|
| 462 |
-
level=self.level,
|
| 463 |
-
parent=self.parents[self.level],
|
| 464 |
-
)
|
| 465 |
-
self.level += 1
|
| 466 |
-
self.text = ""
|
| 467 |
-
|
| 468 |
-
elif name == self.Element.TABLE.value:
|
| 469 |
-
# set an empty table as placeholder
|
| 470 |
-
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
| 471 |
-
self.doc.add_table(
|
| 472 |
-
data=empty_table,
|
| 473 |
-
parent=self.parents[self.level],
|
| 474 |
-
)
|
| 475 |
-
|
| 476 |
-
def _apply_style(self, text: str, style_tag: str) -> str:
|
| 477 |
-
"""Apply an HTML style to text.
|
| 478 |
-
|
| 479 |
-
Args:
|
| 480 |
-
text: A string containing plain text.
|
| 481 |
-
style_tag: An HTML tag name for styling text. If the tag name is not
|
| 482 |
-
recognized as one of the supported styles, the method will return
|
| 483 |
-
the original `text`.
|
| 484 |
-
|
| 485 |
-
Returns:
|
| 486 |
-
A string after applying the style.
|
| 487 |
-
"""
|
| 488 |
-
formatted = text
|
| 489 |
-
|
| 490 |
-
if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
|
| 491 |
-
formatted = html.unescape(self.style_html.get_superscript(text))
|
| 492 |
-
elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
|
| 493 |
-
formatted = html.unescape(self.style_html.get_subscript(text))
|
| 494 |
-
|
| 495 |
-
return formatted
|
| 496 |
-
|
| 497 |
-
def _clean_data(self) -> None:
|
| 498 |
-
"""Reset the variables from stream data."""
|
| 499 |
-
self.property = []
|
| 500 |
-
self.claim = ""
|
| 501 |
-
self.claims = []
|
| 502 |
-
self.abstract = ""
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
class PatentUsptoGrantV2(PatentUspto):
|
| 506 |
-
"""Parser of patent documents from the US Patent Office (grants v2.5).
|
| 507 |
-
|
| 508 |
-
The compatible format is:
|
| 509 |
-
- Patent Grant Full Text Data/XML Version 2.5 (from January 2002 till December 2004)
|
| 510 |
-
"""
|
| 511 |
-
|
| 512 |
-
@override
|
| 513 |
-
def __init__(self) -> None:
|
| 514 |
-
"""Build an instance of PatentUsptoGrantV2 class."""
|
| 515 |
-
self.handler = PatentUsptoGrantV2.PatentHandler()
|
| 516 |
-
self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
|
| 517 |
-
|
| 518 |
-
@override
|
| 519 |
-
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
| 520 |
-
try:
|
| 521 |
-
xml.sax.parseString(patent_content, self.handler)
|
| 522 |
-
except xml.sax._exceptions.SAXParseException as exc_sax:
|
| 523 |
-
_log.error(f"Error in parsing USPTO document: {exc_sax}")
|
| 524 |
-
|
| 525 |
-
return None
|
| 526 |
-
|
| 527 |
-
doc = self.handler.doc
|
| 528 |
-
if doc:
|
| 529 |
-
raw_tables = re.findall(self.pattern, patent_content)
|
| 530 |
-
parsed_tables: list[TableData] = []
|
| 531 |
-
_log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
|
| 532 |
-
for table in raw_tables:
|
| 533 |
-
table_parser = XmlTable(XML_DECLARATION + "\n" + table)
|
| 534 |
-
try:
|
| 535 |
-
table_data = table_parser.parse()
|
| 536 |
-
if table_data:
|
| 537 |
-
parsed_tables.append(table_data)
|
| 538 |
-
except Exception as exc_table:
|
| 539 |
-
_log.error(f"Error in parsing USPTO tables: {exc_table}")
|
| 540 |
-
if len(parsed_tables) != len(doc.tables):
|
| 541 |
-
_log.error(
|
| 542 |
-
f"Number of referenced ({len(doc.tables)}) and parsed "
|
| 543 |
-
f"({len(parsed_tables)}) tables differ."
|
| 544 |
-
)
|
| 545 |
-
else:
|
| 546 |
-
for idx, item in enumerate(parsed_tables):
|
| 547 |
-
doc.tables[idx].data = item
|
| 548 |
-
|
| 549 |
-
return doc
|
| 550 |
-
|
| 551 |
-
class PatentHandler(xml.sax.handler.ContentHandler):
|
| 552 |
-
"""SAX ContentHandler for patent documents."""
|
| 553 |
-
|
| 554 |
-
GRANT_DOC_ELEMENT: Final = "PATDOC"
|
| 555 |
-
CLAIM_STATEMENT: Final = "What is claimed is:"
|
| 556 |
-
|
| 557 |
-
@unique
|
| 558 |
-
class Element(Enum):
|
| 559 |
-
"""Represents an element of interest in the patent application document."""
|
| 560 |
-
|
| 561 |
-
PDAT = "PDAT", True # any type of data
|
| 562 |
-
ABSTRACT = ("SDOAB", False)
|
| 563 |
-
SDOCL = ("SDOCL", False)
|
| 564 |
-
TITLE = ("B540", False)
|
| 565 |
-
CLAIMS = ("CL", False)
|
| 566 |
-
CLAIM = ("CLM", False)
|
| 567 |
-
PARAGRAPH = ("PARA", True)
|
| 568 |
-
HEADING = ("H", True)
|
| 569 |
-
DRAWINGS = ("DRWDESC", False)
|
| 570 |
-
STYLE_SUPERSCRIPT = ("SP", False)
|
| 571 |
-
STYLE_SUBSCRIPT = ("SB", False)
|
| 572 |
-
STYLE_ITALIC = ("ITALIC", False)
|
| 573 |
-
CWU = ("CWU", False) # avoid tables, chemicals, formulas
|
| 574 |
-
TABLE = ("table", False) # to keep track of table positions
|
| 575 |
-
|
| 576 |
-
@override
|
| 577 |
-
def __new__(cls, value: str, _) -> Self:
|
| 578 |
-
obj = object.__new__(cls)
|
| 579 |
-
obj._value_ = value
|
| 580 |
-
return obj
|
| 581 |
-
|
| 582 |
-
@override
|
| 583 |
-
def __init__(self, _, is_text: bool) -> None:
|
| 584 |
-
self.is_text: bool = is_text
|
| 585 |
-
|
| 586 |
-
@override
|
| 587 |
-
def __init__(self) -> None:
|
| 588 |
-
"""Build an instance of the patent handler."""
|
| 589 |
-
# Current patent being parsed
|
| 590 |
-
self.doc: Optional[DoclingDocument] = None
|
| 591 |
-
# Keep track of docling hierarchy level
|
| 592 |
-
self.level: LevelNumber = 1
|
| 593 |
-
# Keep track of docling parents by level
|
| 594 |
-
self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
|
| 595 |
-
# Content to retain for the current patent
|
| 596 |
-
self.property: list[str]
|
| 597 |
-
self.claim: str
|
| 598 |
-
self.claims: list[str]
|
| 599 |
-
self.paragraph: str
|
| 600 |
-
self.abstract: str
|
| 601 |
-
self._clean_data()
|
| 602 |
-
# To handle mathematical styling
|
| 603 |
-
self.style_html = HtmlEntity()
|
| 604 |
-
|
| 605 |
-
@override
|
| 606 |
-
def startElement(self, tag, attributes): # noqa: N802
|
| 607 |
-
"""Signal the start of an element.
|
| 608 |
-
|
| 609 |
-
Args:
|
| 610 |
-
tag: The element tag.
|
| 611 |
-
attributes: The element attributes.
|
| 612 |
-
"""
|
| 613 |
-
if tag == self.GRANT_DOC_ELEMENT:
|
| 614 |
-
self.doc = DoclingDocument(name="file")
|
| 615 |
-
self.text = ""
|
| 616 |
-
self._start_registered_elements(tag, attributes)
|
| 617 |
-
|
| 618 |
-
@override
|
| 619 |
-
def skippedEntity(self, name): # noqa: N802
|
| 620 |
-
"""Receive notification of a skipped entity.
|
| 621 |
-
|
| 622 |
-
HTML entities will be skipped by the parser. This method will unescape them
|
| 623 |
-
and add them to the text.
|
| 624 |
-
|
| 625 |
-
Args:
|
| 626 |
-
name: Entity name.
|
| 627 |
-
"""
|
| 628 |
-
if self.property:
|
| 629 |
-
elm_val = self.property[-1]
|
| 630 |
-
element = self.Element(elm_val)
|
| 631 |
-
if element.is_text:
|
| 632 |
-
escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
|
| 633 |
-
unescaped = html.unescape(escaped)
|
| 634 |
-
if unescaped == escaped:
|
| 635 |
-
logging.debug("Unrecognized HTML entity: " + name)
|
| 636 |
-
return
|
| 637 |
-
|
| 638 |
-
if element in (
|
| 639 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 640 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 641 |
-
):
|
| 642 |
-
# superscripts and subscripts need to be under text elements
|
| 643 |
-
if len(self.property) < 2:
|
| 644 |
-
return
|
| 645 |
-
parent_val = self.property[-2]
|
| 646 |
-
parent = self.Element(parent_val)
|
| 647 |
-
if parent.is_text:
|
| 648 |
-
self.text += self._apply_style(unescaped, elm_val)
|
| 649 |
-
else:
|
| 650 |
-
self.text += unescaped
|
| 651 |
-
|
| 652 |
-
@override
|
| 653 |
-
def endElement(self, tag): # noqa: N802
|
| 654 |
-
"""Signal the end of an element.
|
| 655 |
-
|
| 656 |
-
Args:
|
| 657 |
-
tag: The element tag.
|
| 658 |
-
"""
|
| 659 |
-
if tag == self.GRANT_DOC_ELEMENT:
|
| 660 |
-
self._clean_data()
|
| 661 |
-
self._end_registered_element(tag)
|
| 662 |
-
|
| 663 |
-
@override
|
| 664 |
-
def characters(self, content):
|
| 665 |
-
"""Receive notification of character data.
|
| 666 |
-
|
| 667 |
-
Args:
|
| 668 |
-
content: Data reported by the handler.
|
| 669 |
-
"""
|
| 670 |
-
if self.property:
|
| 671 |
-
elm_val = self.property[-1]
|
| 672 |
-
element = self.Element(elm_val)
|
| 673 |
-
if element.is_text:
|
| 674 |
-
if element in (
|
| 675 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 676 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 677 |
-
):
|
| 678 |
-
# superscripts and subscripts need to be under text elements
|
| 679 |
-
if len(self.property) < 2:
|
| 680 |
-
return
|
| 681 |
-
parent_val = self.property[-2]
|
| 682 |
-
parent = self.Element(parent_val)
|
| 683 |
-
if parent.is_text:
|
| 684 |
-
self.text += self._apply_style(content, elm_val)
|
| 685 |
-
else:
|
| 686 |
-
self.text += content
|
| 687 |
-
|
| 688 |
-
def _start_registered_elements(
|
| 689 |
-
self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
|
| 690 |
-
) -> None:
|
| 691 |
-
if tag in [member.value for member in self.Element]:
|
| 692 |
-
if (
|
| 693 |
-
tag == self.Element.HEADING.value
|
| 694 |
-
and not self.Element.SDOCL.value in self.property
|
| 695 |
-
):
|
| 696 |
-
level_attr: str = attributes.get("LVL", "")
|
| 697 |
-
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
| 698 |
-
max_level = min(self.parents.keys())
|
| 699 |
-
# increase heading level with 1 for title, if any
|
| 700 |
-
self.level = (
|
| 701 |
-
new_level + 1 if (new_level + 1) in self.parents else max_level
|
| 702 |
-
)
|
| 703 |
-
self.property.append(tag)
|
| 704 |
-
|
| 705 |
-
def _end_registered_element(self, tag: str) -> None:
|
| 706 |
-
if tag in [elm.value for elm in self.Element] and self.property:
|
| 707 |
-
current_tag = self.property.pop()
|
| 708 |
-
self._add_property(current_tag, self.text)
|
| 709 |
-
|
| 710 |
-
def _add_property(self, name: str, text: str) -> None:
|
| 711 |
-
if not name or not self.doc:
|
| 712 |
-
return
|
| 713 |
-
if name == self.Element.PDAT.value and text:
|
| 714 |
-
if not self.property:
|
| 715 |
-
self.text = ""
|
| 716 |
-
return
|
| 717 |
-
|
| 718 |
-
wrapper = self.property[-1]
|
| 719 |
-
text = self._apply_style(text, wrapper)
|
| 720 |
-
|
| 721 |
-
if self.Element.TITLE.value in self.property and text.strip():
|
| 722 |
-
title = text.strip()
|
| 723 |
-
self.parents[self.level + 1] = self.doc.add_title(
|
| 724 |
-
parent=self.parents[self.level],
|
| 725 |
-
text=title,
|
| 726 |
-
)
|
| 727 |
-
self.level += 1
|
| 728 |
-
|
| 729 |
-
elif self.Element.ABSTRACT.value in self.property:
|
| 730 |
-
self.abstract += text
|
| 731 |
-
|
| 732 |
-
elif self.Element.CLAIM.value in self.property:
|
| 733 |
-
self.claim += text
|
| 734 |
-
|
| 735 |
-
# Paragraph text not in claims or abstract
|
| 736 |
-
elif (
|
| 737 |
-
self.Element.PARAGRAPH.value in self.property
|
| 738 |
-
and self.Element.CLAIM.value not in self.property
|
| 739 |
-
and self.Element.ABSTRACT.value not in self.property
|
| 740 |
-
):
|
| 741 |
-
self.paragraph += text
|
| 742 |
-
|
| 743 |
-
# headers except claims statement
|
| 744 |
-
elif (
|
| 745 |
-
self.Element.HEADING.value in self.property
|
| 746 |
-
and not self.Element.SDOCL.value in self.property
|
| 747 |
-
and text.strip()
|
| 748 |
-
):
|
| 749 |
-
self.parents[self.level + 1] = self.doc.add_heading(
|
| 750 |
-
text=text.strip(),
|
| 751 |
-
level=self.level,
|
| 752 |
-
parent=self.parents[self.level],
|
| 753 |
-
)
|
| 754 |
-
self.level += 1
|
| 755 |
-
|
| 756 |
-
self.text = ""
|
| 757 |
-
|
| 758 |
-
elif name == self.Element.CLAIM.value and self.claim.strip():
|
| 759 |
-
self.claims.append(self.claim.strip())
|
| 760 |
-
self.claim = ""
|
| 761 |
-
|
| 762 |
-
elif name == self.Element.CLAIMS.value and self.claims:
|
| 763 |
-
heading_text = PatentHeading.CLAIMS.value
|
| 764 |
-
heading_level = (
|
| 765 |
-
PatentHeading.CLAIMS.level
|
| 766 |
-
if PatentHeading.CLAIMS.level in self.parents
|
| 767 |
-
else 1
|
| 768 |
-
)
|
| 769 |
-
claims_item = self.doc.add_heading(
|
| 770 |
-
heading_text,
|
| 771 |
-
level=heading_level,
|
| 772 |
-
parent=self.parents[heading_level],
|
| 773 |
-
)
|
| 774 |
-
for text in self.claims:
|
| 775 |
-
self.doc.add_text(
|
| 776 |
-
label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
|
| 777 |
-
)
|
| 778 |
-
|
| 779 |
-
elif name == self.Element.ABSTRACT.value and self.abstract.strip():
|
| 780 |
-
abstract = self.abstract.strip()
|
| 781 |
-
heading_text = PatentHeading.ABSTRACT.value
|
| 782 |
-
heading_level = (
|
| 783 |
-
PatentHeading.ABSTRACT.level
|
| 784 |
-
if PatentHeading.ABSTRACT.level in self.parents
|
| 785 |
-
else 1
|
| 786 |
-
)
|
| 787 |
-
abstract_item = self.doc.add_heading(
|
| 788 |
-
heading_text,
|
| 789 |
-
level=heading_level,
|
| 790 |
-
parent=self.parents[heading_level],
|
| 791 |
-
)
|
| 792 |
-
self.doc.add_text(
|
| 793 |
-
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
|
| 794 |
-
)
|
| 795 |
-
|
| 796 |
-
elif name == self.Element.PARAGRAPH.value:
|
| 797 |
-
paragraph = self.paragraph.strip()
|
| 798 |
-
if paragraph and self.Element.CLAIM.value not in self.property:
|
| 799 |
-
self.doc.add_text(
|
| 800 |
-
label=DocItemLabel.PARAGRAPH,
|
| 801 |
-
text=paragraph,
|
| 802 |
-
parent=self.parents[self.level],
|
| 803 |
-
)
|
| 804 |
-
elif self.Element.CLAIM.value in self.property:
|
| 805 |
-
# we may need a space after a paragraph in claim text
|
| 806 |
-
self.claim += " "
|
| 807 |
-
self.paragraph = ""
|
| 808 |
-
|
| 809 |
-
elif name == self.Element.TABLE.value:
|
| 810 |
-
# set an empty table as placeholder
|
| 811 |
-
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
| 812 |
-
self.doc.add_table(
|
| 813 |
-
data=empty_table,
|
| 814 |
-
parent=self.parents[self.level],
|
| 815 |
-
)
|
| 816 |
-
|
| 817 |
-
def _apply_style(self, text: str, style_tag: str) -> str:
|
| 818 |
-
"""Apply an HTML style to text.
|
| 819 |
-
|
| 820 |
-
Args:
|
| 821 |
-
text: A string containing plain text.
|
| 822 |
-
style_tag: An HTML tag name for styling text. If the tag name is not
|
| 823 |
-
recognized as one of the supported styles, the method will return
|
| 824 |
-
the original `text`.
|
| 825 |
-
|
| 826 |
-
Returns:
|
| 827 |
-
A string after applying the style.
|
| 828 |
-
"""
|
| 829 |
-
formatted = text
|
| 830 |
-
|
| 831 |
-
if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
|
| 832 |
-
formatted = html.unescape(self.style_html.get_superscript(text))
|
| 833 |
-
elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
|
| 834 |
-
formatted = html.unescape(self.style_html.get_subscript(text))
|
| 835 |
-
elif style_tag == self.Element.STYLE_ITALIC.value:
|
| 836 |
-
formatted = html.unescape(self.style_html.get_math_italic(text))
|
| 837 |
-
|
| 838 |
-
return formatted
|
| 839 |
-
|
| 840 |
-
def _clean_data(self) -> None:
|
| 841 |
-
"""Reset the variables from stream data."""
|
| 842 |
-
self.text = ""
|
| 843 |
-
self.property = []
|
| 844 |
-
self.claim = ""
|
| 845 |
-
self.claims = []
|
| 846 |
-
self.paragraph = ""
|
| 847 |
-
self.abstract = ""
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
class PatentUsptoGrantAps(PatentUspto):
|
| 851 |
-
"""Parser of patents documents from the US Patent Office (grants APS).
|
| 852 |
-
|
| 853 |
-
The compatible format is:
|
| 854 |
-
- Patent Grant Full Text Data/APS (from January 1976 till December 2001)
|
| 855 |
-
"""
|
| 856 |
-
|
| 857 |
-
@unique
|
| 858 |
-
class Section(Enum):
|
| 859 |
-
"""Represent a section in a patent APS document."""
|
| 860 |
-
|
| 861 |
-
ABSTRACT = "ABST"
|
| 862 |
-
SUMMARY = "BSUM"
|
| 863 |
-
DETAILS = "DETD"
|
| 864 |
-
CLAIMS = "CLMS"
|
| 865 |
-
DRAWINGS = "DRWD"
|
| 866 |
-
|
| 867 |
-
@unique
|
| 868 |
-
class Field(Enum):
|
| 869 |
-
"""Represent a field in a patent APS document."""
|
| 870 |
-
|
| 871 |
-
DOC_NUMBER = "WKU"
|
| 872 |
-
TITLE = "TTL"
|
| 873 |
-
PARAGRAPH = "PAR"
|
| 874 |
-
PARAGRAPH_1 = "PA1"
|
| 875 |
-
PARAGRAPH_2 = "PA2"
|
| 876 |
-
PARAGRAPH_3 = "PA3"
|
| 877 |
-
TEXT = "PAL"
|
| 878 |
-
CAPTION = "PAC"
|
| 879 |
-
NUMBER = "NUM"
|
| 880 |
-
NAME = "NAM"
|
| 881 |
-
IPC = "ICL"
|
| 882 |
-
ISSUED = "ISD"
|
| 883 |
-
FILED = "APD"
|
| 884 |
-
PATENT_NUMBER = "PNO"
|
| 885 |
-
APPLICATION_NUMBER = "APN"
|
| 886 |
-
APPLICATION_TYPE = "APT"
|
| 887 |
-
COUNTRY = "CNT"
|
| 888 |
-
|
| 889 |
-
@override
|
| 890 |
-
def __init__(self) -> None:
|
| 891 |
-
"""Build an instance of PatentUsptoGrantAps class."""
|
| 892 |
-
self.doc: Optional[DoclingDocument] = None
|
| 893 |
-
# Keep track of docling hierarchy level
|
| 894 |
-
self.level: LevelNumber = 1
|
| 895 |
-
# Keep track of docling parents by level
|
| 896 |
-
self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
|
| 897 |
-
|
| 898 |
-
def get_last_text_item(self) -> Optional[TextItem]:
|
| 899 |
-
"""Get the last text item at the current document level.
|
| 900 |
-
|
| 901 |
-
Returns:
|
| 902 |
-
The text item or None, if the current level parent has no children."""
|
| 903 |
-
if self.doc:
|
| 904 |
-
parent = self.parents[self.level]
|
| 905 |
-
children = parent.children if parent is not None else []
|
| 906 |
-
else:
|
| 907 |
-
return None
|
| 908 |
-
text_list: list[TextItem] = [
|
| 909 |
-
item
|
| 910 |
-
for item in self.doc.texts
|
| 911 |
-
if isinstance(item, TextItem) and item.get_ref() in children
|
| 912 |
-
]
|
| 913 |
-
|
| 914 |
-
if text_list:
|
| 915 |
-
return text_list[-1]
|
| 916 |
-
else:
|
| 917 |
-
return None
|
| 918 |
-
|
| 919 |
-
def store_section(self, section: str) -> None:
|
| 920 |
-
"""Store the section heading in the docling document.
|
| 921 |
-
|
| 922 |
-
Only the predefined sections from PatentHeading will be handled.
|
| 923 |
-
The other sections are created by the Field.CAPTION field.
|
| 924 |
-
|
| 925 |
-
Args:
|
| 926 |
-
section: A patent section name."""
|
| 927 |
-
heading: PatentHeading
|
| 928 |
-
if self.doc is None:
|
| 929 |
-
return
|
| 930 |
-
elif section == self.Section.ABSTRACT.value:
|
| 931 |
-
heading = PatentHeading.ABSTRACT
|
| 932 |
-
elif section == self.Section.CLAIMS.value:
|
| 933 |
-
heading = PatentHeading.CLAIMS
|
| 934 |
-
else:
|
| 935 |
-
return None
|
| 936 |
-
|
| 937 |
-
self.level = heading.level if heading.level in self.parents else 1
|
| 938 |
-
self.parents[self.level + 1] = self.doc.add_heading(
|
| 939 |
-
heading.value,
|
| 940 |
-
level=self.level,
|
| 941 |
-
parent=self.parents[self.level],
|
| 942 |
-
)
|
| 943 |
-
self.level += 1
|
| 944 |
-
|
| 945 |
-
def store_content(self, section: str, field: str, value: str) -> None:
|
| 946 |
-
"""Store the key value within a document section in the docling document.
|
| 947 |
-
|
| 948 |
-
Args:
|
| 949 |
-
section: A patent section name.
|
| 950 |
-
field: A field name.
|
| 951 |
-
value: A field value name.
|
| 952 |
-
"""
|
| 953 |
-
if (
|
| 954 |
-
not self.doc
|
| 955 |
-
or not field
|
| 956 |
-
or field not in [item.value for item in PatentUsptoGrantAps.Field]
|
| 957 |
-
):
|
| 958 |
-
return
|
| 959 |
-
|
| 960 |
-
if field == self.Field.TITLE.value:
|
| 961 |
-
self.parents[self.level + 1] = self.doc.add_title(
|
| 962 |
-
parent=self.parents[self.level], text=value
|
| 963 |
-
)
|
| 964 |
-
self.level += 1
|
| 965 |
-
|
| 966 |
-
elif field == self.Field.TEXT.value and section == self.Section.ABSTRACT.value:
|
| 967 |
-
abst_item = self.get_last_text_item()
|
| 968 |
-
if abst_item:
|
| 969 |
-
abst_item.text += " " + value
|
| 970 |
-
else:
|
| 971 |
-
self.doc.add_text(
|
| 972 |
-
label=DocItemLabel.PARAGRAPH,
|
| 973 |
-
text=value,
|
| 974 |
-
parent=self.parents[self.level],
|
| 975 |
-
)
|
| 976 |
-
|
| 977 |
-
elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
|
| 978 |
-
self.doc.add_text(
|
| 979 |
-
label=DocItemLabel.PARAGRAPH,
|
| 980 |
-
text="",
|
| 981 |
-
parent=self.parents[self.level],
|
| 982 |
-
)
|
| 983 |
-
|
| 984 |
-
elif (
|
| 985 |
-
field
|
| 986 |
-
in (
|
| 987 |
-
self.Field.PARAGRAPH.value,
|
| 988 |
-
self.Field.PARAGRAPH_1.value,
|
| 989 |
-
self.Field.PARAGRAPH_2.value,
|
| 990 |
-
self.Field.PARAGRAPH_3.value,
|
| 991 |
-
)
|
| 992 |
-
and section == self.Section.CLAIMS.value
|
| 993 |
-
):
|
| 994 |
-
last_claim = self.get_last_text_item()
|
| 995 |
-
if last_claim is None:
|
| 996 |
-
last_claim = self.doc.add_text(
|
| 997 |
-
label=DocItemLabel.PARAGRAPH,
|
| 998 |
-
text="",
|
| 999 |
-
parent=self.parents[self.level],
|
| 1000 |
-
)
|
| 1001 |
-
|
| 1002 |
-
last_claim.text += f" {value}" if last_claim.text else value
|
| 1003 |
-
|
| 1004 |
-
elif field == self.Field.CAPTION.value and section in (
|
| 1005 |
-
self.Section.SUMMARY.value,
|
| 1006 |
-
self.Section.DETAILS.value,
|
| 1007 |
-
self.Section.DRAWINGS.value,
|
| 1008 |
-
):
|
| 1009 |
-
# captions are siblings of abstract since no level info is provided
|
| 1010 |
-
head_item = PatentHeading.ABSTRACT
|
| 1011 |
-
self.level = head_item.level if head_item.level in self.parents else 1
|
| 1012 |
-
self.parents[self.level + 1] = self.doc.add_heading(
|
| 1013 |
-
value,
|
| 1014 |
-
level=self.level,
|
| 1015 |
-
parent=self.parents[self.level],
|
| 1016 |
-
)
|
| 1017 |
-
self.level += 1
|
| 1018 |
-
|
| 1019 |
-
elif field in (
|
| 1020 |
-
self.Field.PARAGRAPH.value,
|
| 1021 |
-
self.Field.PARAGRAPH_1.value,
|
| 1022 |
-
self.Field.PARAGRAPH_2.value,
|
| 1023 |
-
self.Field.PARAGRAPH_3.value,
|
| 1024 |
-
) and section in (
|
| 1025 |
-
self.Section.SUMMARY.value,
|
| 1026 |
-
self.Section.DETAILS.value,
|
| 1027 |
-
self.Section.DRAWINGS.value,
|
| 1028 |
-
):
|
| 1029 |
-
self.doc.add_text(
|
| 1030 |
-
label=DocItemLabel.PARAGRAPH,
|
| 1031 |
-
text=value,
|
| 1032 |
-
parent=self.parents[self.level],
|
| 1033 |
-
)
|
| 1034 |
-
|
| 1035 |
-
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
| 1036 |
-
self.doc = self.doc = DoclingDocument(name="file")
|
| 1037 |
-
section: str = ""
|
| 1038 |
-
key: str = ""
|
| 1039 |
-
value: str = ""
|
| 1040 |
-
line_num = 0
|
| 1041 |
-
for line in patent_content.splitlines():
|
| 1042 |
-
cols = re.split("\\s{2,}", line, maxsplit=1)
|
| 1043 |
-
if key and value and (len(cols) == 1 or (len(cols) == 2 and cols[0])):
|
| 1044 |
-
self.store_content(section, key, value)
|
| 1045 |
-
key = ""
|
| 1046 |
-
value = ""
|
| 1047 |
-
if len(cols) == 1: # section title
|
| 1048 |
-
section = cols[0]
|
| 1049 |
-
self.store_section(section)
|
| 1050 |
-
_log.debug(f"Parsing section {section}")
|
| 1051 |
-
elif len(cols) == 2: # key value
|
| 1052 |
-
if cols[0]: # key present
|
| 1053 |
-
key = cols[0]
|
| 1054 |
-
value = cols[1]
|
| 1055 |
-
elif not re.match(r"^##STR\d+##$", cols[1]): # line continues
|
| 1056 |
-
value += " " + cols[1]
|
| 1057 |
-
line_num += 1
|
| 1058 |
-
if key and value:
|
| 1059 |
-
self.store_content(section, key, value)
|
| 1060 |
-
|
| 1061 |
-
# TODO: parse tables
|
| 1062 |
-
return self.doc
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
class PatentUsptoAppV1(PatentUspto):
|
| 1066 |
-
"""Parser of patent documents from the US Patent Office (applications v1.x)
|
| 1067 |
-
|
| 1068 |
-
The compatible format is:
|
| 1069 |
-
- Patent Application Full Text Data/XML Version 1.x (from March 2001 till December
|
| 1070 |
-
2004)
|
| 1071 |
-
"""
|
| 1072 |
-
|
| 1073 |
-
@override
|
| 1074 |
-
def __init__(self) -> None:
|
| 1075 |
-
"""Build an instance of PatentUsptoAppV1 class."""
|
| 1076 |
-
self.handler = PatentUsptoAppV1.PatentHandler()
|
| 1077 |
-
self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
|
| 1078 |
-
|
| 1079 |
-
@override
|
| 1080 |
-
def parse(self, patent_content: str) -> Optional[DoclingDocument]:
|
| 1081 |
-
try:
|
| 1082 |
-
xml.sax.parseString(patent_content, self.handler)
|
| 1083 |
-
except xml.sax._exceptions.SAXParseException as exc_sax:
|
| 1084 |
-
_log.error(f"Error in parsing USPTO document: {exc_sax}")
|
| 1085 |
-
|
| 1086 |
-
return None
|
| 1087 |
-
|
| 1088 |
-
doc = self.handler.doc
|
| 1089 |
-
if doc:
|
| 1090 |
-
raw_tables = re.findall(self.pattern, patent_content)
|
| 1091 |
-
parsed_tables: list[TableData] = []
|
| 1092 |
-
_log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
|
| 1093 |
-
for table in raw_tables:
|
| 1094 |
-
table_parser = XmlTable(XML_DECLARATION + "\n" + table)
|
| 1095 |
-
try:
|
| 1096 |
-
table_data = table_parser.parse()
|
| 1097 |
-
if table_data:
|
| 1098 |
-
parsed_tables.append(table_data)
|
| 1099 |
-
except Exception as exc_table:
|
| 1100 |
-
_log.error(f"Error in parsing USPTO tables: {exc_table}")
|
| 1101 |
-
if len(parsed_tables) != len(doc.tables):
|
| 1102 |
-
_log.error(
|
| 1103 |
-
f"Number of referenced ({len(doc.tables)}) and parsed "
|
| 1104 |
-
f"({len(parsed_tables)}) tables differ."
|
| 1105 |
-
)
|
| 1106 |
-
else:
|
| 1107 |
-
for idx, item in enumerate(parsed_tables):
|
| 1108 |
-
doc.tables[idx].data = item
|
| 1109 |
-
|
| 1110 |
-
return doc
|
| 1111 |
-
|
| 1112 |
-
class PatentHandler(xml.sax.handler.ContentHandler):
|
| 1113 |
-
"""SAX ContentHandler for patent documents."""
|
| 1114 |
-
|
| 1115 |
-
APP_DOC_ELEMENT: Final = "patent-application-publication"
|
| 1116 |
-
|
| 1117 |
-
@unique
|
| 1118 |
-
class Element(Enum):
|
| 1119 |
-
"""Represents an element of interest in the patent application document."""
|
| 1120 |
-
|
| 1121 |
-
DRAWINGS = "brief-description-of-drawings", False
|
| 1122 |
-
ABSTRACT = "subdoc-abstract", False
|
| 1123 |
-
TITLE = "title-of-invention", True
|
| 1124 |
-
CLAIMS = "subdoc-claims", False
|
| 1125 |
-
CLAIM = "claim", False
|
| 1126 |
-
CLAIM_TEXT = "claim-text", True
|
| 1127 |
-
NUMBER = ("number", False)
|
| 1128 |
-
PARAGRAPH = "paragraph", True
|
| 1129 |
-
HEADING = "heading", True
|
| 1130 |
-
STYLE_SUPERSCRIPT = "superscript", True
|
| 1131 |
-
STYLE_SUBSCRIPT = "subscript", True
|
| 1132 |
-
# do not store text of a table, since it can be within paragraph
|
| 1133 |
-
TABLE = "table", False
|
| 1134 |
-
# do not store text of a formula, since it can be within paragraph
|
| 1135 |
-
MATH = "math-cwu", False
|
| 1136 |
-
|
| 1137 |
-
@override
|
| 1138 |
-
def __new__(cls, value: str, _) -> Self:
|
| 1139 |
-
obj = object.__new__(cls)
|
| 1140 |
-
obj._value_ = value
|
| 1141 |
-
return obj
|
| 1142 |
-
|
| 1143 |
-
@override
|
| 1144 |
-
def __init__(self, _, is_text: bool) -> None:
|
| 1145 |
-
self.is_text: bool = is_text
|
| 1146 |
-
|
| 1147 |
-
@override
|
| 1148 |
-
def __init__(self) -> None:
|
| 1149 |
-
"""Build an instance of the patent handler."""
|
| 1150 |
-
# Current patent being parsed
|
| 1151 |
-
self.doc: Optional[DoclingDocument] = None
|
| 1152 |
-
# Keep track of docling hierarchy level
|
| 1153 |
-
self.level: LevelNumber = 1
|
| 1154 |
-
# Keep track of docling parents by level
|
| 1155 |
-
self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
|
| 1156 |
-
# Content to retain for the current patent
|
| 1157 |
-
self.property: list[str]
|
| 1158 |
-
self.claim: str
|
| 1159 |
-
self.claims: list[str]
|
| 1160 |
-
self.abstract: str
|
| 1161 |
-
self.text: str
|
| 1162 |
-
self._clean_data()
|
| 1163 |
-
# To handle mathematical styling
|
| 1164 |
-
self.style_html = HtmlEntity()
|
| 1165 |
-
|
| 1166 |
-
@override
|
| 1167 |
-
def startElement(self, tag, attributes): # noqa: N802
|
| 1168 |
-
"""Signal the start of an element.
|
| 1169 |
-
|
| 1170 |
-
Args:
|
| 1171 |
-
tag: The element tag.
|
| 1172 |
-
attributes: The element attributes.
|
| 1173 |
-
"""
|
| 1174 |
-
if tag == self.APP_DOC_ELEMENT:
|
| 1175 |
-
self.doc = DoclingDocument(name="file")
|
| 1176 |
-
self.text = ""
|
| 1177 |
-
self._start_registered_elements(tag, attributes)
|
| 1178 |
-
|
| 1179 |
-
@override
|
| 1180 |
-
def skippedEntity(self, name): # noqa: N802
|
| 1181 |
-
"""Receive notification of a skipped entity.
|
| 1182 |
-
|
| 1183 |
-
HTML entities will be skipped by the parser. This method will unescape them
|
| 1184 |
-
and add them to the text.
|
| 1185 |
-
|
| 1186 |
-
Args:
|
| 1187 |
-
name: Entity name.
|
| 1188 |
-
"""
|
| 1189 |
-
if self.property:
|
| 1190 |
-
elm_val = self.property[-1]
|
| 1191 |
-
element = self.Element(elm_val)
|
| 1192 |
-
if element.is_text:
|
| 1193 |
-
escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
|
| 1194 |
-
unescaped = html.unescape(escaped)
|
| 1195 |
-
if unescaped == escaped:
|
| 1196 |
-
logging.debug("Unrecognized HTML entity: " + name)
|
| 1197 |
-
return
|
| 1198 |
-
|
| 1199 |
-
if element in (
|
| 1200 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 1201 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 1202 |
-
):
|
| 1203 |
-
# superscripts and subscripts need to be under text elements
|
| 1204 |
-
if len(self.property) < 2:
|
| 1205 |
-
return
|
| 1206 |
-
parent_val = self.property[-2]
|
| 1207 |
-
parent = self.Element(parent_val)
|
| 1208 |
-
if parent.is_text:
|
| 1209 |
-
self.text += self._apply_style(unescaped, elm_val)
|
| 1210 |
-
else:
|
| 1211 |
-
self.text += unescaped
|
| 1212 |
-
|
| 1213 |
-
@override
|
| 1214 |
-
def endElement(self, tag): # noqa: N802
|
| 1215 |
-
"""Signal the end of an element.
|
| 1216 |
-
|
| 1217 |
-
Args:
|
| 1218 |
-
tag: The element tag.
|
| 1219 |
-
"""
|
| 1220 |
-
if tag == self.APP_DOC_ELEMENT:
|
| 1221 |
-
self._clean_data()
|
| 1222 |
-
self._end_registered_element(tag)
|
| 1223 |
-
|
| 1224 |
-
@override
|
| 1225 |
-
def characters(self, content):
|
| 1226 |
-
"""Receive notification of character data.
|
| 1227 |
-
|
| 1228 |
-
Args:
|
| 1229 |
-
content: Data reported by the handler.
|
| 1230 |
-
"""
|
| 1231 |
-
if self.property:
|
| 1232 |
-
elm_val = self.property[-1]
|
| 1233 |
-
element = self.Element(elm_val)
|
| 1234 |
-
if element.is_text:
|
| 1235 |
-
if element in (
|
| 1236 |
-
self.Element.STYLE_SUPERSCRIPT,
|
| 1237 |
-
self.Element.STYLE_SUBSCRIPT,
|
| 1238 |
-
):
|
| 1239 |
-
# superscripts and subscripts need to be under text elements
|
| 1240 |
-
if len(self.property) < 2:
|
| 1241 |
-
return
|
| 1242 |
-
parent_val = self.property[-2]
|
| 1243 |
-
parent = self.Element(parent_val)
|
| 1244 |
-
if parent.is_text:
|
| 1245 |
-
self.text += self._apply_style(content, elm_val)
|
| 1246 |
-
else:
|
| 1247 |
-
self.text += content
|
| 1248 |
-
|
| 1249 |
-
def _start_registered_elements(
|
| 1250 |
-
self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
|
| 1251 |
-
) -> None:
|
| 1252 |
-
if tag in [member.value for member in self.Element]:
|
| 1253 |
-
# special case for claims: claim lines may start before the
|
| 1254 |
-
# previous one is closed
|
| 1255 |
-
if (
|
| 1256 |
-
tag == self.Element.CLAIM_TEXT.value
|
| 1257 |
-
and self.property
|
| 1258 |
-
and self.property[-1] == tag
|
| 1259 |
-
and self.text.strip()
|
| 1260 |
-
):
|
| 1261 |
-
self.claim += " " + self.text.strip("\n")
|
| 1262 |
-
self.text = ""
|
| 1263 |
-
elif tag == self.Element.HEADING.value:
|
| 1264 |
-
level_attr: str = attributes.get("lvl", "")
|
| 1265 |
-
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
| 1266 |
-
max_level = min(self.parents.keys())
|
| 1267 |
-
# increase heading level with 1 for title, if any
|
| 1268 |
-
self.level = (
|
| 1269 |
-
new_level + 1 if (new_level + 1) in self.parents else max_level
|
| 1270 |
-
)
|
| 1271 |
-
self.property.append(tag)
|
| 1272 |
-
|
| 1273 |
-
def _end_registered_element(self, tag: str) -> None:
|
| 1274 |
-
if tag in [elm.value for elm in self.Element] and self.property:
|
| 1275 |
-
current_tag = self.property.pop()
|
| 1276 |
-
self._add_property(current_tag, self.text)
|
| 1277 |
-
|
| 1278 |
-
def _add_property(self, name: str, text: str) -> None:
|
| 1279 |
-
if not name or not self.doc:
|
| 1280 |
-
return
|
| 1281 |
-
|
| 1282 |
-
if name == self.Element.TITLE.value:
|
| 1283 |
-
title = text.strip()
|
| 1284 |
-
if title:
|
| 1285 |
-
self.parents[self.level + 1] = self.doc.add_text(
|
| 1286 |
-
parent=self.parents[self.level],
|
| 1287 |
-
label=DocItemLabel.TITLE,
|
| 1288 |
-
text=title,
|
| 1289 |
-
)
|
| 1290 |
-
self.level += 1
|
| 1291 |
-
self.text = ""
|
| 1292 |
-
elif name == self.Element.ABSTRACT.value:
|
| 1293 |
-
abstract = self.abstract.strip()
|
| 1294 |
-
if abstract:
|
| 1295 |
-
heading_text = PatentHeading.ABSTRACT.value
|
| 1296 |
-
heading_level = (
|
| 1297 |
-
PatentHeading.ABSTRACT.level
|
| 1298 |
-
if PatentHeading.ABSTRACT.level in self.parents
|
| 1299 |
-
else 1
|
| 1300 |
-
)
|
| 1301 |
-
abstract_item = self.doc.add_heading(
|
| 1302 |
-
heading_text,
|
| 1303 |
-
level=heading_level,
|
| 1304 |
-
parent=self.parents[heading_level],
|
| 1305 |
-
)
|
| 1306 |
-
self.doc.add_text(
|
| 1307 |
-
label=DocItemLabel.PARAGRAPH,
|
| 1308 |
-
text=self.abstract,
|
| 1309 |
-
parent=abstract_item,
|
| 1310 |
-
)
|
| 1311 |
-
self.abstract = ""
|
| 1312 |
-
self.text = ""
|
| 1313 |
-
elif name == self.Element.CLAIM_TEXT.value:
|
| 1314 |
-
if text:
|
| 1315 |
-
self.claim += self.text.strip("\n")
|
| 1316 |
-
self.text = ""
|
| 1317 |
-
|
| 1318 |
-
elif name == self.Element.CLAIM.value:
|
| 1319 |
-
claim = self.claim.strip()
|
| 1320 |
-
if claim:
|
| 1321 |
-
self.claims.append(claim)
|
| 1322 |
-
self.claim = ""
|
| 1323 |
-
|
| 1324 |
-
elif name == self.Element.CLAIMS.value and self.claims:
|
| 1325 |
-
heading_text = PatentHeading.CLAIMS.value
|
| 1326 |
-
heading_level = (
|
| 1327 |
-
PatentHeading.CLAIMS.level
|
| 1328 |
-
if PatentHeading.CLAIMS.level in self.parents
|
| 1329 |
-
else 1
|
| 1330 |
-
)
|
| 1331 |
-
claims_item = self.doc.add_heading(
|
| 1332 |
-
heading_text,
|
| 1333 |
-
level=heading_level,
|
| 1334 |
-
parent=self.parents[heading_level],
|
| 1335 |
-
)
|
| 1336 |
-
for text in self.claims:
|
| 1337 |
-
self.doc.add_text(
|
| 1338 |
-
label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
|
| 1339 |
-
)
|
| 1340 |
-
|
| 1341 |
-
elif name in (
|
| 1342 |
-
self.Element.PARAGRAPH.value,
|
| 1343 |
-
self.Element.HEADING.value,
|
| 1344 |
-
):
|
| 1345 |
-
if text and self.Element.ABSTRACT.value in self.property:
|
| 1346 |
-
self.abstract = (self.abstract + text) if self.abstract else text
|
| 1347 |
-
elif text.strip():
|
| 1348 |
-
text = re.sub("\\s+", " ", text).strip()
|
| 1349 |
-
if name == self.Element.HEADING.value:
|
| 1350 |
-
self.parents[self.level + 1] = self.doc.add_heading(
|
| 1351 |
-
text=text,
|
| 1352 |
-
level=self.level,
|
| 1353 |
-
parent=self.parents[self.level],
|
| 1354 |
-
)
|
| 1355 |
-
self.level += 1
|
| 1356 |
-
else:
|
| 1357 |
-
self.doc.add_text(
|
| 1358 |
-
label=DocItemLabel.PARAGRAPH,
|
| 1359 |
-
text=text,
|
| 1360 |
-
parent=self.parents[self.level],
|
| 1361 |
-
)
|
| 1362 |
-
self.text = ""
|
| 1363 |
-
|
| 1364 |
-
elif name == self.Element.TABLE.value:
|
| 1365 |
-
# set an empty table as placeholder
|
| 1366 |
-
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
| 1367 |
-
self.doc.add_table(
|
| 1368 |
-
data=empty_table,
|
| 1369 |
-
parent=self.parents[self.level],
|
| 1370 |
-
)
|
| 1371 |
-
|
| 1372 |
-
def _apply_style(self, text: str, style_tag: str) -> str:
|
| 1373 |
-
"""Apply an HTML style to text.
|
| 1374 |
-
|
| 1375 |
-
Args:
|
| 1376 |
-
text: A string containing plain text.
|
| 1377 |
-
style_tag: An HTML tag name for styling text. If the tag name is not
|
| 1378 |
-
recognized as one of the supported styles, the method will return
|
| 1379 |
-
the original `text`.
|
| 1380 |
-
|
| 1381 |
-
Returns:
|
| 1382 |
-
A string after applying the style.
|
| 1383 |
-
"""
|
| 1384 |
-
formatted = html.unescape(text)
|
| 1385 |
-
|
| 1386 |
-
if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
|
| 1387 |
-
formatted = html.unescape(self.style_html.get_superscript(formatted))
|
| 1388 |
-
elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
|
| 1389 |
-
formatted = html.unescape(self.style_html.get_subscript(formatted))
|
| 1390 |
-
|
| 1391 |
-
return formatted
|
| 1392 |
-
|
| 1393 |
-
def _clean_data(self):
|
| 1394 |
-
"""Reset the variables from stream data."""
|
| 1395 |
-
self.property = []
|
| 1396 |
-
self.abstract = ""
|
| 1397 |
-
self.claim = ""
|
| 1398 |
-
self.claims = []
|
| 1399 |
-
self.text = ""
|
| 1400 |
-
|
| 1401 |
-
|
| 1402 |
-
class XmlTable:
|
| 1403 |
-
"""Provide a table parser for xml tables in USPTO patent documents.
|
| 1404 |
-
|
| 1405 |
-
The OASIS Open XML Exchange Table Model can be downloaded from:
|
| 1406 |
-
http://oasis-open.org/specs/soextblx.dtd
|
| 1407 |
-
"""
|
| 1408 |
-
|
| 1409 |
-
class MinColInfoType(TypedDict):
|
| 1410 |
-
offset: list[int]
|
| 1411 |
-
colwidth: list[int]
|
| 1412 |
-
|
| 1413 |
-
class ColInfoType(MinColInfoType):
|
| 1414 |
-
cell_range: list[int]
|
| 1415 |
-
cell_offst: list[int]
|
| 1416 |
-
|
| 1417 |
-
def __init__(self, input: str) -> None:
|
| 1418 |
-
"""Initialize the table parser with the xml content.
|
| 1419 |
-
|
| 1420 |
-
Args:
|
| 1421 |
-
input: The xml content.
|
| 1422 |
-
"""
|
| 1423 |
-
self.max_nbr_messages = 2
|
| 1424 |
-
self.nbr_messages = 0
|
| 1425 |
-
self.empty_text = ""
|
| 1426 |
-
self._soup = BeautifulSoup(input, features="xml")
|
| 1427 |
-
|
| 1428 |
-
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
|
| 1429 |
-
"""Create a unified range along the table groups.
|
| 1430 |
-
|
| 1431 |
-
Args:
|
| 1432 |
-
tgs: Table group column specifications.
|
| 1433 |
-
|
| 1434 |
-
Returns:
|
| 1435 |
-
Unified group column specifications.
|
| 1436 |
-
"""
|
| 1437 |
-
colinfo: dict[int, XmlTable.ColInfoType] = {}
|
| 1438 |
-
|
| 1439 |
-
if len(tgs) == 0:
|
| 1440 |
-
return colinfo
|
| 1441 |
-
|
| 1442 |
-
for itg, tg in enumerate(tgs):
|
| 1443 |
-
colinfo[itg] = {
|
| 1444 |
-
"offset": [],
|
| 1445 |
-
"colwidth": [],
|
| 1446 |
-
"cell_range": [],
|
| 1447 |
-
"cell_offst": [0],
|
| 1448 |
-
}
|
| 1449 |
-
offst = 0
|
| 1450 |
-
for info in tg["colinfo"]:
|
| 1451 |
-
cw = info["colwidth"]
|
| 1452 |
-
cw = re.sub("pt", "", cw, flags=re.I)
|
| 1453 |
-
cw = re.sub("mm", "", cw, flags=re.I)
|
| 1454 |
-
try:
|
| 1455 |
-
cw = int(cw)
|
| 1456 |
-
except BaseException:
|
| 1457 |
-
cw = float(cw)
|
| 1458 |
-
colinfo[itg]["colwidth"].append(cw)
|
| 1459 |
-
colinfo[itg]["offset"].append(offst)
|
| 1460 |
-
offst += cw
|
| 1461 |
-
colinfo[itg]["offset"].append(offst)
|
| 1462 |
-
|
| 1463 |
-
min_colinfo: XmlTable.MinColInfoType = {"offset": [], "colwidth": []}
|
| 1464 |
-
|
| 1465 |
-
min_colinfo["offset"] = colinfo[0]["offset"]
|
| 1466 |
-
offset_w0 = []
|
| 1467 |
-
for itg, col in colinfo.items():
|
| 1468 |
-
# keep track of col with 0 width
|
| 1469 |
-
for ic, cw in enumerate(col["colwidth"]):
|
| 1470 |
-
if cw == 0:
|
| 1471 |
-
offset_w0.append(col["offset"][ic])
|
| 1472 |
-
|
| 1473 |
-
min_colinfo["offset"] = sorted(
|
| 1474 |
-
list(set(col["offset"] + min_colinfo["offset"]))
|
| 1475 |
-
)
|
| 1476 |
-
|
| 1477 |
-
# add back the 0 width cols to offset list
|
| 1478 |
-
offset_w0 = list(set(offset_w0))
|
| 1479 |
-
min_colinfo["offset"] = sorted(min_colinfo["offset"] + offset_w0)
|
| 1480 |
-
|
| 1481 |
-
for i in range(len(min_colinfo["offset"]) - 1):
|
| 1482 |
-
min_colinfo["colwidth"].append(
|
| 1483 |
-
min_colinfo["offset"][i + 1] - min_colinfo["offset"][i]
|
| 1484 |
-
)
|
| 1485 |
-
|
| 1486 |
-
for itg, col in colinfo.items():
|
| 1487 |
-
i = 1
|
| 1488 |
-
range_ = 1
|
| 1489 |
-
for min_i in range(1, len(min_colinfo["offset"])):
|
| 1490 |
-
min_offst = min_colinfo["offset"][min_i]
|
| 1491 |
-
offst = col["offset"][i]
|
| 1492 |
-
if min_offst == offst:
|
| 1493 |
-
if (
|
| 1494 |
-
len(col["offset"]) == i + 1
|
| 1495 |
-
and len(min_colinfo["offset"]) > min_i + 1
|
| 1496 |
-
):
|
| 1497 |
-
range_ += 1
|
| 1498 |
-
else:
|
| 1499 |
-
col["cell_range"].append(range_)
|
| 1500 |
-
col["cell_offst"].append(col["cell_offst"][-1] + range_)
|
| 1501 |
-
range_ = 1
|
| 1502 |
-
i += 1
|
| 1503 |
-
elif min_offst < offst:
|
| 1504 |
-
range_ += 1
|
| 1505 |
-
else:
|
| 1506 |
-
_log.debug("A USPTO XML table has wrong offsets.")
|
| 1507 |
-
return {}
|
| 1508 |
-
|
| 1509 |
-
return colinfo
|
| 1510 |
-
|
| 1511 |
-
def _get_max_ncols(self, tgs_info: dict[int, ColInfoType]) -> NonNegativeInt:
|
| 1512 |
-
"""Get the maximum number of columns across table groups.
|
| 1513 |
-
|
| 1514 |
-
Args:
|
| 1515 |
-
tgs_info: Unified group column specifications.
|
| 1516 |
-
|
| 1517 |
-
Return:
|
| 1518 |
-
The maximum number of columns.
|
| 1519 |
-
"""
|
| 1520 |
-
ncols_max = 0
|
| 1521 |
-
for rowinfo in tgs_info.values():
|
| 1522 |
-
ncols_max = max(ncols_max, len(rowinfo["colwidth"]))
|
| 1523 |
-
|
| 1524 |
-
return ncols_max
|
| 1525 |
-
|
| 1526 |
-
def _parse_table(self, table: Tag) -> TableData:
|
| 1527 |
-
"""Parse the content of a table tag.
|
| 1528 |
-
|
| 1529 |
-
Args:
|
| 1530 |
-
The table element.
|
| 1531 |
-
|
| 1532 |
-
Returns:
|
| 1533 |
-
A docling table object.
|
| 1534 |
-
"""
|
| 1535 |
-
tgs_align = []
|
| 1536 |
-
tg_secs = table.find_all("tgroup")
|
| 1537 |
-
if tg_secs:
|
| 1538 |
-
for tg_sec in tg_secs:
|
| 1539 |
-
ncols = tg_sec.get("cols", None)
|
| 1540 |
-
if ncols:
|
| 1541 |
-
ncols = int(ncols)
|
| 1542 |
-
tg_align = {"ncols": ncols, "colinfo": []}
|
| 1543 |
-
cs_secs = tg_sec.find_all("colspec")
|
| 1544 |
-
if cs_secs:
|
| 1545 |
-
for cs_sec in cs_secs:
|
| 1546 |
-
colname = cs_sec.get("colname", None)
|
| 1547 |
-
colwidth = cs_sec.get("colwidth", None)
|
| 1548 |
-
tg_align["colinfo"].append(
|
| 1549 |
-
{"colname": colname, "colwidth": colwidth}
|
| 1550 |
-
)
|
| 1551 |
-
|
| 1552 |
-
tgs_align.append(tg_align)
|
| 1553 |
-
|
| 1554 |
-
# create unified range along the table groups
|
| 1555 |
-
tgs_range = self._create_tg_range(tgs_align)
|
| 1556 |
-
|
| 1557 |
-
# if the structure is broken, return an empty table
|
| 1558 |
-
if not tgs_range:
|
| 1559 |
-
dl_table = TableData(num_rows=0, num_cols=0, table_cells=[])
|
| 1560 |
-
return dl_table
|
| 1561 |
-
|
| 1562 |
-
ncols_max = self._get_max_ncols(tgs_range)
|
| 1563 |
-
|
| 1564 |
-
# extract table data
|
| 1565 |
-
table_data: list[TableCell] = []
|
| 1566 |
-
i_row_global = 0
|
| 1567 |
-
is_row_empty: bool = True
|
| 1568 |
-
tg_secs = table.find_all("tgroup")
|
| 1569 |
-
if tg_secs:
|
| 1570 |
-
for itg, tg_sec in enumerate(tg_secs):
|
| 1571 |
-
tg_range = tgs_range[itg]
|
| 1572 |
-
row_secs = tg_sec.find_all(["row", "tr"])
|
| 1573 |
-
|
| 1574 |
-
if row_secs:
|
| 1575 |
-
for row_sec in row_secs:
|
| 1576 |
-
entry_secs = row_sec.find_all(["entry", "td"])
|
| 1577 |
-
is_header: bool = row_sec.parent.name in ["thead"]
|
| 1578 |
-
|
| 1579 |
-
ncols = 0
|
| 1580 |
-
local_row: list[TableCell] = []
|
| 1581 |
-
is_row_empty = True
|
| 1582 |
-
if entry_secs:
|
| 1583 |
-
wrong_nbr_cols = False
|
| 1584 |
-
for ientry, entry_sec in enumerate(entry_secs):
|
| 1585 |
-
text = entry_sec.get_text().strip()
|
| 1586 |
-
|
| 1587 |
-
# start-end
|
| 1588 |
-
namest = entry_sec.attrs.get("namest", None)
|
| 1589 |
-
nameend = entry_sec.attrs.get("nameend", None)
|
| 1590 |
-
if isinstance(namest, str) and namest.isnumeric():
|
| 1591 |
-
namest = int(namest)
|
| 1592 |
-
else:
|
| 1593 |
-
namest = ientry + 1
|
| 1594 |
-
if isinstance(nameend, str) and nameend.isnumeric():
|
| 1595 |
-
nameend = int(nameend)
|
| 1596 |
-
shift = 0
|
| 1597 |
-
else:
|
| 1598 |
-
nameend = ientry + 2
|
| 1599 |
-
shift = 1
|
| 1600 |
-
|
| 1601 |
-
if nameend > len(tg_range["cell_offst"]):
|
| 1602 |
-
wrong_nbr_cols = True
|
| 1603 |
-
self.nbr_messages += 1
|
| 1604 |
-
if self.nbr_messages <= self.max_nbr_messages:
|
| 1605 |
-
_log.debug(
|
| 1606 |
-
"USPTO table has # entries != # columns"
|
| 1607 |
-
)
|
| 1608 |
-
break
|
| 1609 |
-
|
| 1610 |
-
range_ = [
|
| 1611 |
-
tg_range["cell_offst"][namest - 1],
|
| 1612 |
-
tg_range["cell_offst"][nameend - 1] - shift,
|
| 1613 |
-
]
|
| 1614 |
-
|
| 1615 |
-
# add row and replicate cell if needed
|
| 1616 |
-
cell_text = text if text else self.empty_text
|
| 1617 |
-
if cell_text != self.empty_text:
|
| 1618 |
-
is_row_empty = False
|
| 1619 |
-
for irep in range(range_[0], range_[1] + 1):
|
| 1620 |
-
ncols += 1
|
| 1621 |
-
local_row.append(
|
| 1622 |
-
TableCell(
|
| 1623 |
-
column_header=is_header,
|
| 1624 |
-
text=cell_text,
|
| 1625 |
-
start_row_offset_idx=i_row_global,
|
| 1626 |
-
end_row_offset_idx=i_row_global + 1,
|
| 1627 |
-
row_span=1,
|
| 1628 |
-
start_col_offset_idx=range_[0],
|
| 1629 |
-
end_col_offset_idx=range_[1] + 1,
|
| 1630 |
-
col_span=range_[1] - range_[0] + 1,
|
| 1631 |
-
)
|
| 1632 |
-
)
|
| 1633 |
-
|
| 1634 |
-
if wrong_nbr_cols:
|
| 1635 |
-
# keep empty text, not to introduce noise
|
| 1636 |
-
local_row = []
|
| 1637 |
-
ncols = 0
|
| 1638 |
-
|
| 1639 |
-
# add empty cell up to ncols_max
|
| 1640 |
-
for irep in range(ncols, ncols_max):
|
| 1641 |
-
local_row.append(
|
| 1642 |
-
TableCell(
|
| 1643 |
-
column_header=is_header,
|
| 1644 |
-
text=self.empty_text,
|
| 1645 |
-
start_row_offset_idx=i_row_global,
|
| 1646 |
-
end_row_offset_idx=i_row_global + 1,
|
| 1647 |
-
row_span=1,
|
| 1648 |
-
start_col_offset_idx=irep,
|
| 1649 |
-
end_col_offset_idx=irep + 1,
|
| 1650 |
-
col_span=1,
|
| 1651 |
-
)
|
| 1652 |
-
)
|
| 1653 |
-
# do not add empty rows
|
| 1654 |
-
if not is_row_empty:
|
| 1655 |
-
table_data.extend(local_row)
|
| 1656 |
-
i_row_global += 1
|
| 1657 |
-
|
| 1658 |
-
dl_table = TableData(
|
| 1659 |
-
num_rows=i_row_global, num_cols=ncols_max, table_cells=table_data
|
| 1660 |
-
)
|
| 1661 |
-
|
| 1662 |
-
return dl_table
|
| 1663 |
-
|
| 1664 |
-
def parse(self) -> Optional[TableData]:
|
| 1665 |
-
"""Parse the first table from an xml content.
|
| 1666 |
-
|
| 1667 |
-
Returns:
|
| 1668 |
-
A docling table data.
|
| 1669 |
-
"""
|
| 1670 |
-
section = self._soup.find("table")
|
| 1671 |
-
if section is not None:
|
| 1672 |
-
table = self._parse_table(section)
|
| 1673 |
-
if table.num_rows == 0 or table.num_cols == 0:
|
| 1674 |
-
_log.warning("The parsed USPTO table is empty")
|
| 1675 |
-
return table
|
| 1676 |
-
else:
|
| 1677 |
-
return None
|
| 1678 |
-
|
| 1679 |
-
|
| 1680 |
-
class HtmlEntity:
|
| 1681 |
-
"""Provide utility functions to get the HTML entities of styled characters.
|
| 1682 |
-
|
| 1683 |
-
This class has been developped from:
|
| 1684 |
-
https://unicode-table.com/en/html-entities/
|
| 1685 |
-
https://www.w3.org/TR/WD-math-970515/table03.html
|
| 1686 |
-
"""
|
| 1687 |
-
|
| 1688 |
-
def __init__(self):
|
| 1689 |
-
"""Initialize this class by loading the HTML entity dictionaries."""
|
| 1690 |
-
self.superscript = str.maketrans(
|
| 1691 |
-
{
|
| 1692 |
-
"1": "¹",
|
| 1693 |
-
"2": "²",
|
| 1694 |
-
"3": "³",
|
| 1695 |
-
"4": "⁴",
|
| 1696 |
-
"5": "⁵",
|
| 1697 |
-
"6": "⁶",
|
| 1698 |
-
"7": "⁷",
|
| 1699 |
-
"8": "⁸",
|
| 1700 |
-
"9": "⁹",
|
| 1701 |
-
"0": "⁰",
|
| 1702 |
-
"+": "⁺",
|
| 1703 |
-
"-": "⁻",
|
| 1704 |
-
"−": "⁻",
|
| 1705 |
-
"=": "⁼",
|
| 1706 |
-
"(": "⁽",
|
| 1707 |
-
")": "⁾",
|
| 1708 |
-
"a": "ª",
|
| 1709 |
-
"o": "º",
|
| 1710 |
-
"i": "ⁱ",
|
| 1711 |
-
"n": "ⁿ",
|
| 1712 |
-
}
|
| 1713 |
-
)
|
| 1714 |
-
self.subscript = str.maketrans(
|
| 1715 |
-
{
|
| 1716 |
-
"1": "₁",
|
| 1717 |
-
"2": "₂",
|
| 1718 |
-
"3": "₃",
|
| 1719 |
-
"4": "₄",
|
| 1720 |
-
"5": "₅",
|
| 1721 |
-
"6": "₆",
|
| 1722 |
-
"7": "₇",
|
| 1723 |
-
"8": "₈",
|
| 1724 |
-
"9": "₉",
|
| 1725 |
-
"0": "₀",
|
| 1726 |
-
"+": "₊",
|
| 1727 |
-
"-": "₋",
|
| 1728 |
-
"−": "₋",
|
| 1729 |
-
"=": "₌",
|
| 1730 |
-
"(": "₍",
|
| 1731 |
-
")": "₎",
|
| 1732 |
-
"a": "ₐ",
|
| 1733 |
-
"e": "ₑ",
|
| 1734 |
-
"o": "ₒ",
|
| 1735 |
-
"x": "ₓ",
|
| 1736 |
-
}
|
| 1737 |
-
)
|
| 1738 |
-
self.mathematical_italic = str.maketrans(
|
| 1739 |
-
{
|
| 1740 |
-
"A": "𝐴",
|
| 1741 |
-
"B": "𝐵",
|
| 1742 |
-
"C": "𝐶",
|
| 1743 |
-
"D": "𝐷",
|
| 1744 |
-
"E": "𝐸",
|
| 1745 |
-
"F": "𝐹",
|
| 1746 |
-
"G": "𝐺",
|
| 1747 |
-
"H": "𝐻",
|
| 1748 |
-
"I": "𝐼",
|
| 1749 |
-
"J": "𝐽",
|
| 1750 |
-
"K": "𝐾",
|
| 1751 |
-
"L": "𝐿",
|
| 1752 |
-
"M": "𝑀",
|
| 1753 |
-
"N": "𝑁",
|
| 1754 |
-
"O": "𝑂",
|
| 1755 |
-
"P": "𝑃",
|
| 1756 |
-
"Q": "𝑄",
|
| 1757 |
-
"R": "𝑅",
|
| 1758 |
-
"S": "𝑆",
|
| 1759 |
-
"T": "𝑇",
|
| 1760 |
-
"U": "𝑈",
|
| 1761 |
-
"V": "𝑉",
|
| 1762 |
-
"W": "𝑊",
|
| 1763 |
-
"Y": "𝑌",
|
| 1764 |
-
"Z": "𝑍",
|
| 1765 |
-
"a": "𝑎",
|
| 1766 |
-
"b": "𝑏",
|
| 1767 |
-
"c": "𝑐",
|
| 1768 |
-
"d": "𝑑",
|
| 1769 |
-
"e": "𝑒",
|
| 1770 |
-
"f": "𝑓",
|
| 1771 |
-
"g": "𝑔",
|
| 1772 |
-
"h": "𝑕",
|
| 1773 |
-
"i": "𝑖",
|
| 1774 |
-
"j": "𝑗",
|
| 1775 |
-
"k": "𝑘",
|
| 1776 |
-
"l": "𝑙",
|
| 1777 |
-
"m": "𝑚",
|
| 1778 |
-
"n": "𝑛",
|
| 1779 |
-
"o": "𝑜",
|
| 1780 |
-
"p": "𝑝",
|
| 1781 |
-
"q": "𝑞",
|
| 1782 |
-
"r": "𝑟",
|
| 1783 |
-
"s": "𝑠",
|
| 1784 |
-
"t": "𝑡",
|
| 1785 |
-
"u": "𝑢",
|
| 1786 |
-
"v": "𝑣",
|
| 1787 |
-
"w": "𝑤",
|
| 1788 |
-
"x": "𝑥",
|
| 1789 |
-
"y": "𝑦",
|
| 1790 |
-
"z": "𝑧",
|
| 1791 |
-
}
|
| 1792 |
-
)
|
| 1793 |
-
|
| 1794 |
-
self.lookup_iso8879 = {
|
| 1795 |
-
"&Agr;": "Α",
|
| 1796 |
-
"&Bgr;": "Β",
|
| 1797 |
-
"&Ggr;": "Γ",
|
| 1798 |
-
"&Dgr;": "Δ",
|
| 1799 |
-
"&Egr;": "Ε",
|
| 1800 |
-
"&Zgr;": "Ζ",
|
| 1801 |
-
"&EEgr;": "Η",
|
| 1802 |
-
"&THgr;": "Θ",
|
| 1803 |
-
"&Igr;": "Ι",
|
| 1804 |
-
"&Kgr;": "Κ",
|
| 1805 |
-
"&Lgr;": "Λ",
|
| 1806 |
-
"&Mgr;": "Μ",
|
| 1807 |
-
"&Ngr;": "Ν",
|
| 1808 |
-
"&Xgr;": "Ξ",
|
| 1809 |
-
"&Ogr;": "Ο",
|
| 1810 |
-
"&Pgr;": "Π",
|
| 1811 |
-
"&Rgr;": "Ρ",
|
| 1812 |
-
"&Sgr;": "Σ",
|
| 1813 |
-
"&Tgr;": "Τ",
|
| 1814 |
-
"&Ugr;": "Υ",
|
| 1815 |
-
"&PHgr;": "Φ",
|
| 1816 |
-
"&KHgr;": "Χ",
|
| 1817 |
-
"&PSgr;": "Ψ",
|
| 1818 |
-
"&OHgr;": "Ω",
|
| 1819 |
-
"&agr;": "α",
|
| 1820 |
-
"&bgr;": "β",
|
| 1821 |
-
"&ggr;": "γ",
|
| 1822 |
-
"&dgr;": "δ",
|
| 1823 |
-
"&egr;": "ε",
|
| 1824 |
-
"&zgr;": "ζ",
|
| 1825 |
-
"&eegr;": "η",
|
| 1826 |
-
"&thgr;": "θ",
|
| 1827 |
-
"&igr;": "ι",
|
| 1828 |
-
"&kgr;": "κ",
|
| 1829 |
-
"&lgr;": "λ",
|
| 1830 |
-
"&mgr;": "μ",
|
| 1831 |
-
"&ngr;": "ν",
|
| 1832 |
-
"&xgr;": "ξ",
|
| 1833 |
-
"&ogr;": "ο",
|
| 1834 |
-
"&pgr;": "π",
|
| 1835 |
-
"&rgr;": "ρ",
|
| 1836 |
-
"&sgr;": "ς",
|
| 1837 |
-
"&tgr;": "τ",
|
| 1838 |
-
"&ugr;": "υ",
|
| 1839 |
-
"&phgr;": "φ",
|
| 1840 |
-
"&khgr;": "χ",
|
| 1841 |
-
"&psgr;": "ψ",
|
| 1842 |
-
"&ohgr;": "ω",
|
| 1843 |
-
}
|
| 1844 |
-
|
| 1845 |
-
def get_superscript(self, text: str) -> str:
|
| 1846 |
-
"""Get a text in superscript as HTML entities.
|
| 1847 |
-
|
| 1848 |
-
Args:
|
| 1849 |
-
text: The text to transform.
|
| 1850 |
-
|
| 1851 |
-
Returns:
|
| 1852 |
-
The text in superscript as HTML entities.
|
| 1853 |
-
"""
|
| 1854 |
-
return text.translate(self.superscript)
|
| 1855 |
-
|
| 1856 |
-
def get_subscript(self, text: str) -> str:
|
| 1857 |
-
"""Get a text in subscript as HTML entities.
|
| 1858 |
-
|
| 1859 |
-
Args:
|
| 1860 |
-
The text to transform.
|
| 1861 |
-
|
| 1862 |
-
Returns:
|
| 1863 |
-
The text in subscript as HTML entities.
|
| 1864 |
-
"""
|
| 1865 |
-
return text.translate(self.subscript)
|
| 1866 |
-
|
| 1867 |
-
def get_math_italic(self, text: str) -> str:
|
| 1868 |
-
"""Get a text in italic as HTML entities.
|
| 1869 |
-
|
| 1870 |
-
Args:
|
| 1871 |
-
The text to transform.
|
| 1872 |
-
|
| 1873 |
-
Returns:
|
| 1874 |
-
The text in italics as HTML entities.
|
| 1875 |
-
"""
|
| 1876 |
-
return text.translate(self.mathematical_italic)
|
| 1877 |
-
|
| 1878 |
-
def get_greek_from_iso8879(self, text: str) -> str:
|
| 1879 |
-
"""Get an HTML entity of a greek letter in ISO 8879.
|
| 1880 |
-
|
| 1881 |
-
Args:
|
| 1882 |
-
The text to transform, as an ISO 8879 entitiy.
|
| 1883 |
-
|
| 1884 |
-
Returns:
|
| 1885 |
-
The HTML entity representing a greek letter. If the input text is not
|
| 1886 |
-
supported, the original text is returned.
|
| 1887 |
-
"""
|
| 1888 |
-
return self.lookup_iso8879.get(text, text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
#
|
| 2 |
-
# Copyright IBM Corp. 2024 - 2024
|
| 3 |
-
# SPDX-License-Identifier: MIT
|
| 4 |
-
#
|
| 5 |
-
|
| 6 |
-
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
| 7 |
-
from docling_core.transforms.chunker.hierarchical_chunker import (
|
| 8 |
-
DocChunk,
|
| 9 |
-
DocMeta,
|
| 10 |
-
HierarchicalChunker,
|
| 11 |
-
)
|
| 12 |
-
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py
DELETED
|
@@ -1,456 +0,0 @@
|
|
| 1 |
-
import importlib
|
| 2 |
-
import logging
|
| 3 |
-
import platform
|
| 4 |
-
import re
|
| 5 |
-
import sys
|
| 6 |
-
import tempfile
|
| 7 |
-
import time
|
| 8 |
-
import warnings
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
| 11 |
-
|
| 12 |
-
import typer
|
| 13 |
-
from docling_core.types.doc import ImageRefMode
|
| 14 |
-
from docling_core.utils.file import resolve_source_to_path
|
| 15 |
-
from pydantic import TypeAdapter
|
| 16 |
-
|
| 17 |
-
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
| 18 |
-
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
| 19 |
-
from docling.backend.pdf_backend import PdfDocumentBackend
|
| 20 |
-
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
| 21 |
-
from docling.datamodel.base_models import (
|
| 22 |
-
ConversionStatus,
|
| 23 |
-
FormatToExtensions,
|
| 24 |
-
InputFormat,
|
| 25 |
-
OutputFormat,
|
| 26 |
-
)
|
| 27 |
-
from docling.datamodel.document import ConversionResult
|
| 28 |
-
from docling.datamodel.pipeline_options import (
|
| 29 |
-
AcceleratorDevice,
|
| 30 |
-
AcceleratorOptions,
|
| 31 |
-
EasyOcrOptions,
|
| 32 |
-
OcrEngine,
|
| 33 |
-
OcrMacOptions,
|
| 34 |
-
OcrOptions,
|
| 35 |
-
PdfBackend,
|
| 36 |
-
PdfPipelineOptions,
|
| 37 |
-
RapidOcrOptions,
|
| 38 |
-
TableFormerMode,
|
| 39 |
-
TesseractCliOcrOptions,
|
| 40 |
-
TesseractOcrOptions,
|
| 41 |
-
)
|
| 42 |
-
from docling.datamodel.settings import settings
|
| 43 |
-
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
| 44 |
-
|
| 45 |
-
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
| 46 |
-
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
| 47 |
-
|
| 48 |
-
_log = logging.getLogger(__name__)
|
| 49 |
-
from rich.console import Console
|
| 50 |
-
|
| 51 |
-
err_console = Console(stderr=True)
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
app = typer.Typer(
|
| 55 |
-
name="Docling",
|
| 56 |
-
no_args_is_help=True,
|
| 57 |
-
add_completion=False,
|
| 58 |
-
pretty_exceptions_enable=False,
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def version_callback(value: bool):
|
| 63 |
-
if value:
|
| 64 |
-
docling_version = importlib.metadata.version("docling")
|
| 65 |
-
docling_core_version = importlib.metadata.version("docling-core")
|
| 66 |
-
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
|
| 67 |
-
docling_parse_version = importlib.metadata.version("docling-parse")
|
| 68 |
-
platform_str = platform.platform()
|
| 69 |
-
py_impl_version = sys.implementation.cache_tag
|
| 70 |
-
py_lang_version = platform.python_version()
|
| 71 |
-
print(f"Docling version: {docling_version}")
|
| 72 |
-
print(f"Docling Core version: {docling_core_version}")
|
| 73 |
-
print(f"Docling IBM Models version: {docling_ibm_models_version}")
|
| 74 |
-
print(f"Docling Parse version: {docling_parse_version}")
|
| 75 |
-
print(f"Python: {py_impl_version} ({py_lang_version})")
|
| 76 |
-
print(f"Platform: {platform_str}")
|
| 77 |
-
raise typer.Exit()
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def export_documents(
|
| 81 |
-
conv_results: Iterable[ConversionResult],
|
| 82 |
-
output_dir: Path,
|
| 83 |
-
export_json: bool,
|
| 84 |
-
export_html: bool,
|
| 85 |
-
export_md: bool,
|
| 86 |
-
export_txt: bool,
|
| 87 |
-
export_doctags: bool,
|
| 88 |
-
image_export_mode: ImageRefMode,
|
| 89 |
-
):
|
| 90 |
-
|
| 91 |
-
success_count = 0
|
| 92 |
-
failure_count = 0
|
| 93 |
-
|
| 94 |
-
for conv_res in conv_results:
|
| 95 |
-
if conv_res.status == ConversionStatus.SUCCESS:
|
| 96 |
-
success_count += 1
|
| 97 |
-
doc_filename = conv_res.input.file.stem
|
| 98 |
-
|
| 99 |
-
# Export JSON format:
|
| 100 |
-
if export_json:
|
| 101 |
-
fname = output_dir / f"{doc_filename}.json"
|
| 102 |
-
_log.info(f"writing JSON output to {fname}")
|
| 103 |
-
conv_res.document.save_as_json(
|
| 104 |
-
filename=fname, image_mode=image_export_mode
|
| 105 |
-
)
|
| 106 |
-
|
| 107 |
-
# Export HTML format:
|
| 108 |
-
if export_html:
|
| 109 |
-
fname = output_dir / f"{doc_filename}.html"
|
| 110 |
-
_log.info(f"writing HTML output to {fname}")
|
| 111 |
-
conv_res.document.save_as_html(
|
| 112 |
-
filename=fname, image_mode=image_export_mode
|
| 113 |
-
)
|
| 114 |
-
|
| 115 |
-
# Export Text format:
|
| 116 |
-
if export_txt:
|
| 117 |
-
fname = output_dir / f"{doc_filename}.txt"
|
| 118 |
-
_log.info(f"writing TXT output to {fname}")
|
| 119 |
-
conv_res.document.save_as_markdown(
|
| 120 |
-
filename=fname,
|
| 121 |
-
strict_text=True,
|
| 122 |
-
image_mode=ImageRefMode.PLACEHOLDER,
|
| 123 |
-
)
|
| 124 |
-
|
| 125 |
-
# Export Markdown format:
|
| 126 |
-
if export_md:
|
| 127 |
-
fname = output_dir / f"{doc_filename}.md"
|
| 128 |
-
_log.info(f"writing Markdown output to {fname}")
|
| 129 |
-
conv_res.document.save_as_markdown(
|
| 130 |
-
filename=fname, image_mode=image_export_mode
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
# Export Document Tags format:
|
| 134 |
-
if export_doctags:
|
| 135 |
-
fname = output_dir / f"{doc_filename}.doctags"
|
| 136 |
-
_log.info(f"writing Doc Tags output to {fname}")
|
| 137 |
-
conv_res.document.save_as_document_tokens(filename=fname)
|
| 138 |
-
|
| 139 |
-
else:
|
| 140 |
-
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
| 141 |
-
failure_count += 1
|
| 142 |
-
|
| 143 |
-
_log.info(
|
| 144 |
-
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
| 149 |
-
if raw is None:
|
| 150 |
-
return None
|
| 151 |
-
return re.split(r"[;,]", raw)
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
@app.command(no_args_is_help=True)
|
| 155 |
-
def convert(
|
| 156 |
-
input_sources: Annotated[
|
| 157 |
-
List[str],
|
| 158 |
-
typer.Argument(
|
| 159 |
-
...,
|
| 160 |
-
metavar="source",
|
| 161 |
-
help="PDF files to convert. Can be local file / directory paths or URL.",
|
| 162 |
-
),
|
| 163 |
-
],
|
| 164 |
-
from_formats: List[InputFormat] = typer.Option(
|
| 165 |
-
None,
|
| 166 |
-
"--from",
|
| 167 |
-
help="Specify input formats to convert from. Defaults to all formats.",
|
| 168 |
-
),
|
| 169 |
-
to_formats: List[OutputFormat] = typer.Option(
|
| 170 |
-
None, "--to", help="Specify output formats. Defaults to Markdown."
|
| 171 |
-
),
|
| 172 |
-
headers: str = typer.Option(
|
| 173 |
-
None,
|
| 174 |
-
"--headers",
|
| 175 |
-
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
|
| 176 |
-
),
|
| 177 |
-
image_export_mode: Annotated[
|
| 178 |
-
ImageRefMode,
|
| 179 |
-
typer.Option(
|
| 180 |
-
...,
|
| 181 |
-
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
| 182 |
-
),
|
| 183 |
-
] = ImageRefMode.EMBEDDED,
|
| 184 |
-
ocr: Annotated[
|
| 185 |
-
bool,
|
| 186 |
-
typer.Option(
|
| 187 |
-
..., help="If enabled, the bitmap content will be processed using OCR."
|
| 188 |
-
),
|
| 189 |
-
] = True,
|
| 190 |
-
force_ocr: Annotated[
|
| 191 |
-
bool,
|
| 192 |
-
typer.Option(
|
| 193 |
-
...,
|
| 194 |
-
help="Replace any existing text with OCR generated text over the full content.",
|
| 195 |
-
),
|
| 196 |
-
] = False,
|
| 197 |
-
ocr_engine: Annotated[
|
| 198 |
-
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
| 199 |
-
] = OcrEngine.EASYOCR,
|
| 200 |
-
ocr_lang: Annotated[
|
| 201 |
-
Optional[str],
|
| 202 |
-
typer.Option(
|
| 203 |
-
...,
|
| 204 |
-
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
| 205 |
-
),
|
| 206 |
-
] = None,
|
| 207 |
-
pdf_backend: Annotated[
|
| 208 |
-
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
| 209 |
-
] = PdfBackend.DLPARSE_V2,
|
| 210 |
-
table_mode: Annotated[
|
| 211 |
-
TableFormerMode,
|
| 212 |
-
typer.Option(..., help="The mode to use in the table structure model."),
|
| 213 |
-
] = TableFormerMode.FAST,
|
| 214 |
-
enrich_code: Annotated[
|
| 215 |
-
bool,
|
| 216 |
-
typer.Option(..., help="Enable the code enrichment model in the pipeline."),
|
| 217 |
-
] = False,
|
| 218 |
-
enrich_formula: Annotated[
|
| 219 |
-
bool,
|
| 220 |
-
typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
|
| 221 |
-
] = False,
|
| 222 |
-
enrich_picture_classes: Annotated[
|
| 223 |
-
bool,
|
| 224 |
-
typer.Option(
|
| 225 |
-
...,
|
| 226 |
-
help="Enable the picture classification enrichment model in the pipeline.",
|
| 227 |
-
),
|
| 228 |
-
] = False,
|
| 229 |
-
enrich_picture_description: Annotated[
|
| 230 |
-
bool,
|
| 231 |
-
typer.Option(..., help="Enable the picture description model in the pipeline."),
|
| 232 |
-
] = False,
|
| 233 |
-
artifacts_path: Annotated[
|
| 234 |
-
Optional[Path],
|
| 235 |
-
typer.Option(..., help="If provided, the location of the model artifacts."),
|
| 236 |
-
] = None,
|
| 237 |
-
abort_on_error: Annotated[
|
| 238 |
-
bool,
|
| 239 |
-
typer.Option(
|
| 240 |
-
...,
|
| 241 |
-
"--abort-on-error/--no-abort-on-error",
|
| 242 |
-
help="If enabled, the bitmap content will be processed using OCR.",
|
| 243 |
-
),
|
| 244 |
-
] = False,
|
| 245 |
-
output: Annotated[
|
| 246 |
-
Path, typer.Option(..., help="Output directory where results are saved.")
|
| 247 |
-
] = Path("."),
|
| 248 |
-
verbose: Annotated[
|
| 249 |
-
int,
|
| 250 |
-
typer.Option(
|
| 251 |
-
"--verbose",
|
| 252 |
-
"-v",
|
| 253 |
-
count=True,
|
| 254 |
-
help="Set the verbosity level. -v for info logging, -vv for debug logging.",
|
| 255 |
-
),
|
| 256 |
-
] = 0,
|
| 257 |
-
debug_visualize_cells: Annotated[
|
| 258 |
-
bool,
|
| 259 |
-
typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
|
| 260 |
-
] = False,
|
| 261 |
-
debug_visualize_ocr: Annotated[
|
| 262 |
-
bool,
|
| 263 |
-
typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
|
| 264 |
-
] = False,
|
| 265 |
-
debug_visualize_layout: Annotated[
|
| 266 |
-
bool,
|
| 267 |
-
typer.Option(
|
| 268 |
-
..., help="Enable debug output which visualizes the layour clusters"
|
| 269 |
-
),
|
| 270 |
-
] = False,
|
| 271 |
-
debug_visualize_tables: Annotated[
|
| 272 |
-
bool,
|
| 273 |
-
typer.Option(..., help="Enable debug output which visualizes the table cells"),
|
| 274 |
-
] = False,
|
| 275 |
-
version: Annotated[
|
| 276 |
-
Optional[bool],
|
| 277 |
-
typer.Option(
|
| 278 |
-
"--version",
|
| 279 |
-
callback=version_callback,
|
| 280 |
-
is_eager=True,
|
| 281 |
-
help="Show version information.",
|
| 282 |
-
),
|
| 283 |
-
] = None,
|
| 284 |
-
document_timeout: Annotated[
|
| 285 |
-
Optional[float],
|
| 286 |
-
typer.Option(
|
| 287 |
-
...,
|
| 288 |
-
help="The timeout for processing each document, in seconds.",
|
| 289 |
-
),
|
| 290 |
-
] = None,
|
| 291 |
-
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
|
| 292 |
-
device: Annotated[
|
| 293 |
-
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
| 294 |
-
] = AcceleratorDevice.AUTO,
|
| 295 |
-
):
|
| 296 |
-
if verbose == 0:
|
| 297 |
-
logging.basicConfig(level=logging.WARNING)
|
| 298 |
-
elif verbose == 1:
|
| 299 |
-
logging.basicConfig(level=logging.INFO)
|
| 300 |
-
elif verbose == 2:
|
| 301 |
-
logging.basicConfig(level=logging.DEBUG)
|
| 302 |
-
|
| 303 |
-
settings.debug.visualize_cells = debug_visualize_cells
|
| 304 |
-
settings.debug.visualize_layout = debug_visualize_layout
|
| 305 |
-
settings.debug.visualize_tables = debug_visualize_tables
|
| 306 |
-
settings.debug.visualize_ocr = debug_visualize_ocr
|
| 307 |
-
|
| 308 |
-
if from_formats is None:
|
| 309 |
-
from_formats = [e for e in InputFormat]
|
| 310 |
-
|
| 311 |
-
parsed_headers: Optional[Dict[str, str]] = None
|
| 312 |
-
if headers is not None:
|
| 313 |
-
headers_t = TypeAdapter(Dict[str, str])
|
| 314 |
-
parsed_headers = headers_t.validate_json(headers)
|
| 315 |
-
|
| 316 |
-
with tempfile.TemporaryDirectory() as tempdir:
|
| 317 |
-
input_doc_paths: List[Path] = []
|
| 318 |
-
for src in input_sources:
|
| 319 |
-
try:
|
| 320 |
-
# check if we can fetch some remote url
|
| 321 |
-
source = resolve_source_to_path(
|
| 322 |
-
source=src, headers=parsed_headers, workdir=Path(tempdir)
|
| 323 |
-
)
|
| 324 |
-
input_doc_paths.append(source)
|
| 325 |
-
except FileNotFoundError:
|
| 326 |
-
err_console.print(
|
| 327 |
-
f"[red]Error: The input file {src} does not exist.[/red]"
|
| 328 |
-
)
|
| 329 |
-
raise typer.Abort()
|
| 330 |
-
except IsADirectoryError:
|
| 331 |
-
# if the input matches to a file or a folder
|
| 332 |
-
try:
|
| 333 |
-
local_path = TypeAdapter(Path).validate_python(src)
|
| 334 |
-
if local_path.exists() and local_path.is_dir():
|
| 335 |
-
for fmt in from_formats:
|
| 336 |
-
for ext in FormatToExtensions[fmt]:
|
| 337 |
-
input_doc_paths.extend(
|
| 338 |
-
list(local_path.glob(f"**/*.{ext}"))
|
| 339 |
-
)
|
| 340 |
-
input_doc_paths.extend(
|
| 341 |
-
list(local_path.glob(f"**/*.{ext.upper()}"))
|
| 342 |
-
)
|
| 343 |
-
elif local_path.exists():
|
| 344 |
-
input_doc_paths.append(local_path)
|
| 345 |
-
else:
|
| 346 |
-
err_console.print(
|
| 347 |
-
f"[red]Error: The input file {src} does not exist.[/red]"
|
| 348 |
-
)
|
| 349 |
-
raise typer.Abort()
|
| 350 |
-
except Exception as err:
|
| 351 |
-
err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
|
| 352 |
-
_log.info(err) # will print more details if verbose is activated
|
| 353 |
-
raise typer.Abort()
|
| 354 |
-
|
| 355 |
-
if to_formats is None:
|
| 356 |
-
to_formats = [OutputFormat.MARKDOWN]
|
| 357 |
-
|
| 358 |
-
export_json = OutputFormat.JSON in to_formats
|
| 359 |
-
export_html = OutputFormat.HTML in to_formats
|
| 360 |
-
export_md = OutputFormat.MARKDOWN in to_formats
|
| 361 |
-
export_txt = OutputFormat.TEXT in to_formats
|
| 362 |
-
export_doctags = OutputFormat.DOCTAGS in to_formats
|
| 363 |
-
|
| 364 |
-
if ocr_engine == OcrEngine.EASYOCR:
|
| 365 |
-
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
| 366 |
-
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
| 367 |
-
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
| 368 |
-
elif ocr_engine == OcrEngine.TESSERACT:
|
| 369 |
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
| 370 |
-
elif ocr_engine == OcrEngine.OCRMAC:
|
| 371 |
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
| 372 |
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
| 373 |
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
| 374 |
-
else:
|
| 375 |
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
| 376 |
-
|
| 377 |
-
ocr_lang_list = _split_list(ocr_lang)
|
| 378 |
-
if ocr_lang_list is not None:
|
| 379 |
-
ocr_options.lang = ocr_lang_list
|
| 380 |
-
|
| 381 |
-
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
| 382 |
-
pipeline_options = PdfPipelineOptions(
|
| 383 |
-
accelerator_options=accelerator_options,
|
| 384 |
-
do_ocr=ocr,
|
| 385 |
-
ocr_options=ocr_options,
|
| 386 |
-
do_table_structure=True,
|
| 387 |
-
do_code_enrichment=enrich_code,
|
| 388 |
-
do_formula_enrichment=enrich_formula,
|
| 389 |
-
do_picture_description=enrich_picture_description,
|
| 390 |
-
do_picture_classification=enrich_picture_classes,
|
| 391 |
-
document_timeout=document_timeout,
|
| 392 |
-
)
|
| 393 |
-
pipeline_options.table_structure_options.do_cell_matching = (
|
| 394 |
-
True # do_cell_matching
|
| 395 |
-
)
|
| 396 |
-
pipeline_options.table_structure_options.mode = table_mode
|
| 397 |
-
|
| 398 |
-
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
| 399 |
-
pipeline_options.generate_page_images = True
|
| 400 |
-
pipeline_options.generate_picture_images = (
|
| 401 |
-
True # FIXME: to be deprecated in verson 3
|
| 402 |
-
)
|
| 403 |
-
pipeline_options.images_scale = 2
|
| 404 |
-
|
| 405 |
-
if artifacts_path is not None:
|
| 406 |
-
pipeline_options.artifacts_path = artifacts_path
|
| 407 |
-
|
| 408 |
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
| 409 |
-
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
| 410 |
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
| 411 |
-
backend = DoclingParseV2DocumentBackend
|
| 412 |
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
| 413 |
-
backend = PyPdfiumDocumentBackend
|
| 414 |
-
else:
|
| 415 |
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
| 416 |
-
|
| 417 |
-
pdf_format_option = PdfFormatOption(
|
| 418 |
-
pipeline_options=pipeline_options,
|
| 419 |
-
backend=backend, # pdf_backend
|
| 420 |
-
)
|
| 421 |
-
format_options: Dict[InputFormat, FormatOption] = {
|
| 422 |
-
InputFormat.PDF: pdf_format_option,
|
| 423 |
-
InputFormat.IMAGE: pdf_format_option,
|
| 424 |
-
}
|
| 425 |
-
doc_converter = DocumentConverter(
|
| 426 |
-
allowed_formats=from_formats,
|
| 427 |
-
format_options=format_options,
|
| 428 |
-
)
|
| 429 |
-
|
| 430 |
-
start_time = time.time()
|
| 431 |
-
|
| 432 |
-
conv_results = doc_converter.convert_all(
|
| 433 |
-
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
| 434 |
-
)
|
| 435 |
-
|
| 436 |
-
output.mkdir(parents=True, exist_ok=True)
|
| 437 |
-
export_documents(
|
| 438 |
-
conv_results,
|
| 439 |
-
output_dir=output,
|
| 440 |
-
export_json=export_json,
|
| 441 |
-
export_html=export_html,
|
| 442 |
-
export_md=export_md,
|
| 443 |
-
export_txt=export_txt,
|
| 444 |
-
export_doctags=export_doctags,
|
| 445 |
-
image_export_mode=image_export_mode,
|
| 446 |
-
)
|
| 447 |
-
|
| 448 |
-
end_time = time.time() - start_time
|
| 449 |
-
|
| 450 |
-
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
click_app = typer.main.get_command(app)
|
| 454 |
-
|
| 455 |
-
if __name__ == "__main__":
|
| 456 |
-
app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py
DELETED
|
@@ -1,107 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import warnings
|
| 3 |
-
from enum import Enum
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Annotated, Optional
|
| 6 |
-
|
| 7 |
-
import typer
|
| 8 |
-
from rich.console import Console
|
| 9 |
-
from rich.logging import RichHandler
|
| 10 |
-
|
| 11 |
-
from docling.datamodel.settings import settings
|
| 12 |
-
from docling.utils.model_downloader import download_models
|
| 13 |
-
|
| 14 |
-
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
| 15 |
-
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
| 16 |
-
|
| 17 |
-
console = Console()
|
| 18 |
-
err_console = Console(stderr=True)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
app = typer.Typer(
|
| 22 |
-
name="Docling models helper",
|
| 23 |
-
no_args_is_help=True,
|
| 24 |
-
add_completion=False,
|
| 25 |
-
pretty_exceptions_enable=False,
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
class _AvailableModels(str, Enum):
|
| 30 |
-
LAYOUT = "layout"
|
| 31 |
-
TABLEFORMER = "tableformer"
|
| 32 |
-
CODE_FORMULA = "code_formula"
|
| 33 |
-
PICTURE_CLASSIFIER = "picture_classifier"
|
| 34 |
-
SMOLVLM = "smolvlm"
|
| 35 |
-
EASYOCR = "easyocr"
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
@app.command("download")
|
| 39 |
-
def download(
|
| 40 |
-
output_dir: Annotated[
|
| 41 |
-
Path,
|
| 42 |
-
typer.Option(
|
| 43 |
-
...,
|
| 44 |
-
"-o",
|
| 45 |
-
"--output-dir",
|
| 46 |
-
help="The directory where all the models are downloaded.",
|
| 47 |
-
),
|
| 48 |
-
] = (settings.cache_dir / "models"),
|
| 49 |
-
force: Annotated[
|
| 50 |
-
bool, typer.Option(..., help="If true, the download will be forced")
|
| 51 |
-
] = False,
|
| 52 |
-
models: Annotated[
|
| 53 |
-
Optional[list[_AvailableModels]],
|
| 54 |
-
typer.Argument(
|
| 55 |
-
help=f"Models to download (default behavior: all will be downloaded)",
|
| 56 |
-
),
|
| 57 |
-
] = None,
|
| 58 |
-
quiet: Annotated[
|
| 59 |
-
bool,
|
| 60 |
-
typer.Option(
|
| 61 |
-
...,
|
| 62 |
-
"-q",
|
| 63 |
-
"--quiet",
|
| 64 |
-
help="No extra output is generated, the CLI prints only the directory with the cached models.",
|
| 65 |
-
),
|
| 66 |
-
] = False,
|
| 67 |
-
):
|
| 68 |
-
if not quiet:
|
| 69 |
-
FORMAT = "%(message)s"
|
| 70 |
-
logging.basicConfig(
|
| 71 |
-
level=logging.INFO,
|
| 72 |
-
format="[blue]%(message)s[/blue]",
|
| 73 |
-
datefmt="[%X]",
|
| 74 |
-
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
| 75 |
-
)
|
| 76 |
-
to_download = models or [m for m in _AvailableModels]
|
| 77 |
-
output_dir = download_models(
|
| 78 |
-
output_dir=output_dir,
|
| 79 |
-
force=force,
|
| 80 |
-
progress=(not quiet),
|
| 81 |
-
with_layout=_AvailableModels.LAYOUT in to_download,
|
| 82 |
-
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
|
| 83 |
-
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
| 84 |
-
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
| 85 |
-
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
| 86 |
-
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
if quiet:
|
| 90 |
-
typer.echo(output_dir)
|
| 91 |
-
else:
|
| 92 |
-
typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
|
| 93 |
-
|
| 94 |
-
console.print(
|
| 95 |
-
"\n",
|
| 96 |
-
"Docling can now be configured for running offline using the local artifacts.\n\n",
|
| 97 |
-
"Using the CLI:",
|
| 98 |
-
f"`docling --artifacts-path={output_dir} FILE`",
|
| 99 |
-
"\n",
|
| 100 |
-
"Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
|
| 101 |
-
)
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
click_app = typer.main.get_command(app)
|
| 105 |
-
|
| 106 |
-
if __name__ == "__main__":
|
| 107 |
-
app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
import typer
|
| 2 |
-
|
| 3 |
-
from docling.cli.models import app as models_app
|
| 4 |
-
|
| 5 |
-
app = typer.Typer(
|
| 6 |
-
name="Docling helpers",
|
| 7 |
-
no_args_is_help=True,
|
| 8 |
-
add_completion=False,
|
| 9 |
-
pretty_exceptions_enable=False,
|
| 10 |
-
)
|
| 11 |
-
|
| 12 |
-
app.add_typer(models_app, name="models")
|
| 13 |
-
|
| 14 |
-
click_app = typer.main.get_command(app)
|
| 15 |
-
|
| 16 |
-
if __name__ == "__main__":
|
| 17 |
-
app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py
DELETED
|
@@ -1,258 +0,0 @@
|
|
| 1 |
-
from enum import Enum
|
| 2 |
-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
| 3 |
-
|
| 4 |
-
from docling_core.types.doc import (
|
| 5 |
-
BoundingBox,
|
| 6 |
-
DocItemLabel,
|
| 7 |
-
NodeItem,
|
| 8 |
-
PictureDataType,
|
| 9 |
-
Size,
|
| 10 |
-
TableCell,
|
| 11 |
-
)
|
| 12 |
-
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
| 13 |
-
DocumentStream,
|
| 14 |
-
)
|
| 15 |
-
from PIL.Image import Image
|
| 16 |
-
from pydantic import BaseModel, ConfigDict
|
| 17 |
-
|
| 18 |
-
if TYPE_CHECKING:
|
| 19 |
-
from docling.backend.pdf_backend import PdfPageBackend
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class ConversionStatus(str, Enum):
|
| 23 |
-
PENDING = "pending"
|
| 24 |
-
STARTED = "started"
|
| 25 |
-
FAILURE = "failure"
|
| 26 |
-
SUCCESS = "success"
|
| 27 |
-
PARTIAL_SUCCESS = "partial_success"
|
| 28 |
-
SKIPPED = "skipped"
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class InputFormat(str, Enum):
|
| 32 |
-
"""A document format supported by document backend parsers."""
|
| 33 |
-
|
| 34 |
-
DOCX = "docx"
|
| 35 |
-
PPTX = "pptx"
|
| 36 |
-
HTML = "html"
|
| 37 |
-
XML_PUBMED = "xml_pubmed"
|
| 38 |
-
IMAGE = "image"
|
| 39 |
-
PDF = "pdf"
|
| 40 |
-
ASCIIDOC = "asciidoc"
|
| 41 |
-
MD = "md"
|
| 42 |
-
XLSX = "xlsx"
|
| 43 |
-
XML_USPTO = "xml_uspto"
|
| 44 |
-
JSON_DOCLING = "json_docling"
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
class OutputFormat(str, Enum):
|
| 48 |
-
MARKDOWN = "md"
|
| 49 |
-
JSON = "json"
|
| 50 |
-
HTML = "html"
|
| 51 |
-
TEXT = "text"
|
| 52 |
-
DOCTAGS = "doctags"
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
| 56 |
-
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
| 57 |
-
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
| 58 |
-
InputFormat.PDF: ["pdf"],
|
| 59 |
-
InputFormat.MD: ["md"],
|
| 60 |
-
InputFormat.HTML: ["html", "htm", "xhtml"],
|
| 61 |
-
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
| 62 |
-
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
| 63 |
-
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
| 64 |
-
InputFormat.XLSX: ["xlsx"],
|
| 65 |
-
InputFormat.XML_USPTO: ["xml", "txt"],
|
| 66 |
-
InputFormat.JSON_DOCLING: ["json"],
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
| 70 |
-
InputFormat.DOCX: [
|
| 71 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 72 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
| 73 |
-
],
|
| 74 |
-
InputFormat.PPTX: [
|
| 75 |
-
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
| 76 |
-
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
| 77 |
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
| 78 |
-
],
|
| 79 |
-
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
| 80 |
-
InputFormat.XML_PUBMED: ["application/xml"],
|
| 81 |
-
InputFormat.IMAGE: [
|
| 82 |
-
"image/png",
|
| 83 |
-
"image/jpeg",
|
| 84 |
-
"image/tiff",
|
| 85 |
-
"image/gif",
|
| 86 |
-
"image/bmp",
|
| 87 |
-
],
|
| 88 |
-
InputFormat.PDF: ["application/pdf"],
|
| 89 |
-
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
| 90 |
-
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
| 91 |
-
InputFormat.XLSX: [
|
| 92 |
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 93 |
-
],
|
| 94 |
-
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
| 95 |
-
InputFormat.JSON_DOCLING: ["application/json"],
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
| 99 |
-
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
| 100 |
-
for value in FormatToMimeType.values()
|
| 101 |
-
for mime in value
|
| 102 |
-
}
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
class DocInputType(str, Enum):
|
| 106 |
-
PATH = "path"
|
| 107 |
-
STREAM = "stream"
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
class DoclingComponentType(str, Enum):
|
| 111 |
-
DOCUMENT_BACKEND = "document_backend"
|
| 112 |
-
MODEL = "model"
|
| 113 |
-
DOC_ASSEMBLER = "doc_assembler"
|
| 114 |
-
USER_INPUT = "user_input"
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
class ErrorItem(BaseModel):
|
| 118 |
-
component_type: DoclingComponentType
|
| 119 |
-
module_name: str
|
| 120 |
-
error_message: str
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
class Cell(BaseModel):
|
| 124 |
-
id: int
|
| 125 |
-
text: str
|
| 126 |
-
bbox: BoundingBox
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
class OcrCell(Cell):
|
| 130 |
-
confidence: float
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
class Cluster(BaseModel):
|
| 134 |
-
id: int
|
| 135 |
-
label: DocItemLabel
|
| 136 |
-
bbox: BoundingBox
|
| 137 |
-
confidence: float = 1.0
|
| 138 |
-
cells: List[Cell] = []
|
| 139 |
-
children: List["Cluster"] = [] # Add child cluster support
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
class BasePageElement(BaseModel):
|
| 143 |
-
label: DocItemLabel
|
| 144 |
-
id: int
|
| 145 |
-
page_no: int
|
| 146 |
-
cluster: Cluster
|
| 147 |
-
text: Optional[str] = None
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
class LayoutPrediction(BaseModel):
|
| 151 |
-
clusters: List[Cluster] = []
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
class ContainerElement(
|
| 155 |
-
BasePageElement
|
| 156 |
-
): # Used for Form and Key-Value-Regions, only for typing.
|
| 157 |
-
pass
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
class Table(BasePageElement):
|
| 161 |
-
otsl_seq: List[str]
|
| 162 |
-
num_rows: int = 0
|
| 163 |
-
num_cols: int = 0
|
| 164 |
-
table_cells: List[TableCell]
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
class TableStructurePrediction(BaseModel):
|
| 168 |
-
table_map: Dict[int, Table] = {}
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
class TextElement(BasePageElement):
|
| 172 |
-
text: str
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
class FigureElement(BasePageElement):
|
| 176 |
-
annotations: List[PictureDataType] = []
|
| 177 |
-
provenance: Optional[str] = None
|
| 178 |
-
predicted_class: Optional[str] = None
|
| 179 |
-
confidence: Optional[float] = None
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
class FigureClassificationPrediction(BaseModel):
|
| 183 |
-
figure_count: int = 0
|
| 184 |
-
figure_map: Dict[int, FigureElement] = {}
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
class EquationPrediction(BaseModel):
|
| 188 |
-
equation_count: int = 0
|
| 189 |
-
equation_map: Dict[int, TextElement] = {}
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
class PagePredictions(BaseModel):
|
| 193 |
-
layout: Optional[LayoutPrediction] = None
|
| 194 |
-
tablestructure: Optional[TableStructurePrediction] = None
|
| 195 |
-
figures_classification: Optional[FigureClassificationPrediction] = None
|
| 196 |
-
equations_prediction: Optional[EquationPrediction] = None
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
class AssembledUnit(BaseModel):
|
| 203 |
-
elements: List[PageElement] = []
|
| 204 |
-
body: List[PageElement] = []
|
| 205 |
-
headers: List[PageElement] = []
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
class ItemAndImageEnrichmentElement(BaseModel):
|
| 209 |
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 210 |
-
|
| 211 |
-
item: NodeItem
|
| 212 |
-
image: Image
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
class Page(BaseModel):
|
| 216 |
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 217 |
-
|
| 218 |
-
page_no: int
|
| 219 |
-
# page_hash: Optional[str] = None
|
| 220 |
-
size: Optional[Size] = None
|
| 221 |
-
cells: List[Cell] = []
|
| 222 |
-
predictions: PagePredictions = PagePredictions()
|
| 223 |
-
assembled: Optional[AssembledUnit] = None
|
| 224 |
-
|
| 225 |
-
_backend: Optional["PdfPageBackend"] = (
|
| 226 |
-
None # Internal PDF backend. By default it is cleared during assembling.
|
| 227 |
-
)
|
| 228 |
-
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
| 229 |
-
_image_cache: Dict[float, Image] = (
|
| 230 |
-
{}
|
| 231 |
-
) # Cache of images in different scales. By default it is cleared during assembling.
|
| 232 |
-
|
| 233 |
-
def get_image(
|
| 234 |
-
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
| 235 |
-
) -> Optional[Image]:
|
| 236 |
-
if self._backend is None:
|
| 237 |
-
return self._image_cache.get(scale, None)
|
| 238 |
-
|
| 239 |
-
if not scale in self._image_cache:
|
| 240 |
-
if cropbox is None:
|
| 241 |
-
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
| 242 |
-
else:
|
| 243 |
-
return self._backend.get_page_image(scale=scale, cropbox=cropbox)
|
| 244 |
-
|
| 245 |
-
if cropbox is None:
|
| 246 |
-
return self._image_cache[scale]
|
| 247 |
-
else:
|
| 248 |
-
page_im = self._image_cache[scale]
|
| 249 |
-
assert self.size is not None
|
| 250 |
-
return page_im.crop(
|
| 251 |
-
cropbox.to_top_left_origin(page_height=self.size.height)
|
| 252 |
-
.scaled(scale=scale)
|
| 253 |
-
.as_tuple()
|
| 254 |
-
)
|
| 255 |
-
|
| 256 |
-
@property
|
| 257 |
-
def image(self) -> Optional[Image]:
|
| 258 |
-
return self.get_image(scale=self._default_image_scale)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py
DELETED
|
@@ -1,394 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import re
|
| 3 |
-
from enum import Enum
|
| 4 |
-
from io import BytesIO
|
| 5 |
-
from pathlib import Path, PurePath
|
| 6 |
-
from typing import (
|
| 7 |
-
TYPE_CHECKING,
|
| 8 |
-
Dict,
|
| 9 |
-
Iterable,
|
| 10 |
-
List,
|
| 11 |
-
Literal,
|
| 12 |
-
Optional,
|
| 13 |
-
Set,
|
| 14 |
-
Type,
|
| 15 |
-
Union,
|
| 16 |
-
)
|
| 17 |
-
|
| 18 |
-
import filetype
|
| 19 |
-
from docling_core.types.doc import (
|
| 20 |
-
DocItem,
|
| 21 |
-
DocItemLabel,
|
| 22 |
-
DoclingDocument,
|
| 23 |
-
PictureItem,
|
| 24 |
-
SectionHeaderItem,
|
| 25 |
-
TableItem,
|
| 26 |
-
TextItem,
|
| 27 |
-
)
|
| 28 |
-
from docling_core.types.doc.document import ListItem
|
| 29 |
-
from docling_core.types.legacy_doc.base import (
|
| 30 |
-
BaseText,
|
| 31 |
-
Figure,
|
| 32 |
-
GlmTableCell,
|
| 33 |
-
PageDimensions,
|
| 34 |
-
PageReference,
|
| 35 |
-
Prov,
|
| 36 |
-
Ref,
|
| 37 |
-
)
|
| 38 |
-
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
| 39 |
-
from docling_core.types.legacy_doc.base import TableCell
|
| 40 |
-
from docling_core.types.legacy_doc.document import (
|
| 41 |
-
CCSDocumentDescription as DsDocumentDescription,
|
| 42 |
-
)
|
| 43 |
-
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
| 44 |
-
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
| 45 |
-
from docling_core.utils.file import resolve_source_to_stream
|
| 46 |
-
from docling_core.utils.legacy import docling_document_to_legacy
|
| 47 |
-
from pydantic import BaseModel
|
| 48 |
-
from typing_extensions import deprecated
|
| 49 |
-
|
| 50 |
-
from docling.backend.abstract_backend import (
|
| 51 |
-
AbstractDocumentBackend,
|
| 52 |
-
PaginatedDocumentBackend,
|
| 53 |
-
)
|
| 54 |
-
from docling.datamodel.base_models import (
|
| 55 |
-
AssembledUnit,
|
| 56 |
-
ConversionStatus,
|
| 57 |
-
DocumentStream,
|
| 58 |
-
ErrorItem,
|
| 59 |
-
FormatToExtensions,
|
| 60 |
-
FormatToMimeType,
|
| 61 |
-
InputFormat,
|
| 62 |
-
MimeTypeToFormat,
|
| 63 |
-
Page,
|
| 64 |
-
)
|
| 65 |
-
from docling.datamodel.settings import DocumentLimits
|
| 66 |
-
from docling.utils.profiling import ProfilingItem
|
| 67 |
-
from docling.utils.utils import create_file_hash, create_hash
|
| 68 |
-
|
| 69 |
-
if TYPE_CHECKING:
|
| 70 |
-
from docling.document_converter import FormatOption
|
| 71 |
-
|
| 72 |
-
_log = logging.getLogger(__name__)
|
| 73 |
-
|
| 74 |
-
layout_label_to_ds_type = {
|
| 75 |
-
DocItemLabel.TITLE: "title",
|
| 76 |
-
DocItemLabel.DOCUMENT_INDEX: "table",
|
| 77 |
-
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
| 78 |
-
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
| 79 |
-
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
| 80 |
-
DocItemLabel.CAPTION: "caption",
|
| 81 |
-
DocItemLabel.PAGE_HEADER: "page-header",
|
| 82 |
-
DocItemLabel.PAGE_FOOTER: "page-footer",
|
| 83 |
-
DocItemLabel.FOOTNOTE: "footnote",
|
| 84 |
-
DocItemLabel.TABLE: "table",
|
| 85 |
-
DocItemLabel.FORMULA: "equation",
|
| 86 |
-
DocItemLabel.LIST_ITEM: "paragraph",
|
| 87 |
-
DocItemLabel.CODE: "paragraph",
|
| 88 |
-
DocItemLabel.PICTURE: "figure",
|
| 89 |
-
DocItemLabel.TEXT: "paragraph",
|
| 90 |
-
DocItemLabel.PARAGRAPH: "paragraph",
|
| 91 |
-
DocItemLabel.FORM: DocItemLabel.FORM.value,
|
| 92 |
-
DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
|
| 93 |
-
}
|
| 94 |
-
|
| 95 |
-
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
class InputDocument(BaseModel):
|
| 99 |
-
file: PurePath
|
| 100 |
-
document_hash: str # = None
|
| 101 |
-
valid: bool = True
|
| 102 |
-
limits: DocumentLimits = DocumentLimits()
|
| 103 |
-
format: InputFormat # = None
|
| 104 |
-
|
| 105 |
-
filesize: Optional[int] = None
|
| 106 |
-
page_count: int = 0
|
| 107 |
-
|
| 108 |
-
_backend: AbstractDocumentBackend # Internal PDF backend used
|
| 109 |
-
|
| 110 |
-
def __init__(
|
| 111 |
-
self,
|
| 112 |
-
path_or_stream: Union[BytesIO, Path],
|
| 113 |
-
format: InputFormat,
|
| 114 |
-
backend: Type[AbstractDocumentBackend],
|
| 115 |
-
filename: Optional[str] = None,
|
| 116 |
-
limits: Optional[DocumentLimits] = None,
|
| 117 |
-
):
|
| 118 |
-
super().__init__(
|
| 119 |
-
file="", document_hash="", format=InputFormat.PDF
|
| 120 |
-
) # initialize with dummy values
|
| 121 |
-
|
| 122 |
-
self.limits = limits or DocumentLimits()
|
| 123 |
-
self.format = format
|
| 124 |
-
|
| 125 |
-
try:
|
| 126 |
-
if isinstance(path_or_stream, Path):
|
| 127 |
-
self.file = path_or_stream
|
| 128 |
-
self.filesize = path_or_stream.stat().st_size
|
| 129 |
-
if self.filesize > self.limits.max_file_size:
|
| 130 |
-
self.valid = False
|
| 131 |
-
else:
|
| 132 |
-
self.document_hash = create_file_hash(path_or_stream)
|
| 133 |
-
self._init_doc(backend, path_or_stream)
|
| 134 |
-
|
| 135 |
-
elif isinstance(path_or_stream, BytesIO):
|
| 136 |
-
assert (
|
| 137 |
-
filename is not None
|
| 138 |
-
), "Can't construct InputDocument from stream without providing filename arg."
|
| 139 |
-
self.file = PurePath(filename)
|
| 140 |
-
self.filesize = path_or_stream.getbuffer().nbytes
|
| 141 |
-
|
| 142 |
-
if self.filesize > self.limits.max_file_size:
|
| 143 |
-
self.valid = False
|
| 144 |
-
else:
|
| 145 |
-
self.document_hash = create_file_hash(path_or_stream)
|
| 146 |
-
self._init_doc(backend, path_or_stream)
|
| 147 |
-
else:
|
| 148 |
-
raise RuntimeError(
|
| 149 |
-
f"Unexpected type path_or_stream: {type(path_or_stream)}"
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
# For paginated backends, check if the maximum page count is exceeded.
|
| 153 |
-
if self.valid and self._backend.is_valid():
|
| 154 |
-
if self._backend.supports_pagination() and isinstance(
|
| 155 |
-
self._backend, PaginatedDocumentBackend
|
| 156 |
-
):
|
| 157 |
-
self.page_count = self._backend.page_count()
|
| 158 |
-
if not self.page_count <= self.limits.max_num_pages:
|
| 159 |
-
self.valid = False
|
| 160 |
-
elif self.page_count < self.limits.page_range[0]:
|
| 161 |
-
self.valid = False
|
| 162 |
-
|
| 163 |
-
except (FileNotFoundError, OSError) as e:
|
| 164 |
-
self.valid = False
|
| 165 |
-
_log.exception(
|
| 166 |
-
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
| 167 |
-
)
|
| 168 |
-
# raise
|
| 169 |
-
except RuntimeError as e:
|
| 170 |
-
self.valid = False
|
| 171 |
-
_log.exception(
|
| 172 |
-
f"An unexpected error occurred while opening the document {self.file.name}",
|
| 173 |
-
exc_info=e,
|
| 174 |
-
)
|
| 175 |
-
# raise
|
| 176 |
-
|
| 177 |
-
def _init_doc(
|
| 178 |
-
self,
|
| 179 |
-
backend: Type[AbstractDocumentBackend],
|
| 180 |
-
path_or_stream: Union[BytesIO, Path],
|
| 181 |
-
) -> None:
|
| 182 |
-
self._backend = backend(self, path_or_stream=path_or_stream)
|
| 183 |
-
if not self._backend.is_valid():
|
| 184 |
-
self.valid = False
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
class DocumentFormat(str, Enum):
|
| 188 |
-
V2 = "v2"
|
| 189 |
-
V1 = "v1"
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
class ConversionResult(BaseModel):
|
| 193 |
-
input: InputDocument
|
| 194 |
-
|
| 195 |
-
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
| 196 |
-
errors: List[ErrorItem] = [] # structure to keep errors
|
| 197 |
-
|
| 198 |
-
pages: List[Page] = []
|
| 199 |
-
assembled: AssembledUnit = AssembledUnit()
|
| 200 |
-
timings: Dict[str, ProfilingItem] = {}
|
| 201 |
-
|
| 202 |
-
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
| 203 |
-
|
| 204 |
-
@property
|
| 205 |
-
@deprecated("Use document instead.")
|
| 206 |
-
def legacy_document(self):
|
| 207 |
-
return docling_document_to_legacy(self.document)
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
class _DummyBackend(AbstractDocumentBackend):
|
| 211 |
-
def __init__(self, *args, **kwargs):
|
| 212 |
-
super().__init__(*args, **kwargs)
|
| 213 |
-
|
| 214 |
-
def is_valid(self) -> bool:
|
| 215 |
-
return False
|
| 216 |
-
|
| 217 |
-
@classmethod
|
| 218 |
-
def supported_formats(cls) -> Set[InputFormat]:
|
| 219 |
-
return set()
|
| 220 |
-
|
| 221 |
-
@classmethod
|
| 222 |
-
def supports_pagination(cls) -> bool:
|
| 223 |
-
return False
|
| 224 |
-
|
| 225 |
-
def unload(self):
|
| 226 |
-
return super().unload()
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
class _DocumentConversionInput(BaseModel):
|
| 230 |
-
|
| 231 |
-
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
| 232 |
-
headers: Optional[Dict[str, str]] = None
|
| 233 |
-
limits: Optional[DocumentLimits] = DocumentLimits()
|
| 234 |
-
|
| 235 |
-
def docs(
|
| 236 |
-
self, format_options: Dict[InputFormat, "FormatOption"]
|
| 237 |
-
) -> Iterable[InputDocument]:
|
| 238 |
-
for item in self.path_or_stream_iterator:
|
| 239 |
-
obj = (
|
| 240 |
-
resolve_source_to_stream(item, self.headers)
|
| 241 |
-
if isinstance(item, str)
|
| 242 |
-
else item
|
| 243 |
-
)
|
| 244 |
-
format = self._guess_format(obj)
|
| 245 |
-
backend: Type[AbstractDocumentBackend]
|
| 246 |
-
if format not in format_options.keys():
|
| 247 |
-
_log.error(
|
| 248 |
-
f"Input document {obj.name} does not match any allowed format."
|
| 249 |
-
)
|
| 250 |
-
backend = _DummyBackend
|
| 251 |
-
else:
|
| 252 |
-
backend = format_options[format].backend
|
| 253 |
-
|
| 254 |
-
if isinstance(obj, Path):
|
| 255 |
-
yield InputDocument(
|
| 256 |
-
path_or_stream=obj,
|
| 257 |
-
format=format, # type: ignore[arg-type]
|
| 258 |
-
filename=obj.name,
|
| 259 |
-
limits=self.limits,
|
| 260 |
-
backend=backend,
|
| 261 |
-
)
|
| 262 |
-
elif isinstance(obj, DocumentStream):
|
| 263 |
-
yield InputDocument(
|
| 264 |
-
path_or_stream=obj.stream,
|
| 265 |
-
format=format, # type: ignore[arg-type]
|
| 266 |
-
filename=obj.name,
|
| 267 |
-
limits=self.limits,
|
| 268 |
-
backend=backend,
|
| 269 |
-
)
|
| 270 |
-
else:
|
| 271 |
-
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
| 272 |
-
|
| 273 |
-
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
| 274 |
-
content = b"" # empty binary blob
|
| 275 |
-
formats: list[InputFormat] = []
|
| 276 |
-
|
| 277 |
-
if isinstance(obj, Path):
|
| 278 |
-
mime = filetype.guess_mime(str(obj))
|
| 279 |
-
if mime is None:
|
| 280 |
-
ext = obj.suffix[1:]
|
| 281 |
-
mime = _DocumentConversionInput._mime_from_extension(ext)
|
| 282 |
-
if mime is None: # must guess from
|
| 283 |
-
with obj.open("rb") as f:
|
| 284 |
-
content = f.read(1024) # Read first 1KB
|
| 285 |
-
|
| 286 |
-
elif isinstance(obj, DocumentStream):
|
| 287 |
-
content = obj.stream.read(8192)
|
| 288 |
-
obj.stream.seek(0)
|
| 289 |
-
mime = filetype.guess_mime(content)
|
| 290 |
-
if mime is None:
|
| 291 |
-
ext = (
|
| 292 |
-
obj.name.rsplit(".", 1)[-1]
|
| 293 |
-
if ("." in obj.name and not obj.name.startswith("."))
|
| 294 |
-
else ""
|
| 295 |
-
)
|
| 296 |
-
mime = _DocumentConversionInput._mime_from_extension(ext)
|
| 297 |
-
|
| 298 |
-
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
| 299 |
-
mime = mime or "text/plain"
|
| 300 |
-
formats = MimeTypeToFormat.get(mime, [])
|
| 301 |
-
if formats:
|
| 302 |
-
if len(formats) == 1 and mime not in ("text/plain"):
|
| 303 |
-
return formats[0]
|
| 304 |
-
else: # ambiguity in formats
|
| 305 |
-
return _DocumentConversionInput._guess_from_content(
|
| 306 |
-
content, mime, formats
|
| 307 |
-
)
|
| 308 |
-
else:
|
| 309 |
-
return None
|
| 310 |
-
|
| 311 |
-
@staticmethod
|
| 312 |
-
def _guess_from_content(
|
| 313 |
-
content: bytes, mime: str, formats: list[InputFormat]
|
| 314 |
-
) -> Optional[InputFormat]:
|
| 315 |
-
"""Guess the input format of a document by checking part of its content."""
|
| 316 |
-
input_format: Optional[InputFormat] = None
|
| 317 |
-
content_str = content.decode("utf-8")
|
| 318 |
-
|
| 319 |
-
if mime == "application/xml":
|
| 320 |
-
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
| 321 |
-
if match_doctype:
|
| 322 |
-
xml_doctype = match_doctype.group()
|
| 323 |
-
if InputFormat.XML_USPTO in formats and any(
|
| 324 |
-
item in xml_doctype
|
| 325 |
-
for item in (
|
| 326 |
-
"us-patent-application-v4",
|
| 327 |
-
"us-patent-grant-v4",
|
| 328 |
-
"us-grant-025",
|
| 329 |
-
"patent-application-publication",
|
| 330 |
-
)
|
| 331 |
-
):
|
| 332 |
-
input_format = InputFormat.XML_USPTO
|
| 333 |
-
|
| 334 |
-
if (
|
| 335 |
-
InputFormat.XML_PUBMED in formats
|
| 336 |
-
and "/NLM//DTD JATS" in xml_doctype
|
| 337 |
-
):
|
| 338 |
-
input_format = InputFormat.XML_PUBMED
|
| 339 |
-
|
| 340 |
-
elif mime == "text/plain":
|
| 341 |
-
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
| 342 |
-
input_format = InputFormat.XML_USPTO
|
| 343 |
-
|
| 344 |
-
return input_format
|
| 345 |
-
|
| 346 |
-
@staticmethod
|
| 347 |
-
def _mime_from_extension(ext):
|
| 348 |
-
mime = None
|
| 349 |
-
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
| 350 |
-
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
| 351 |
-
elif ext in FormatToExtensions[InputFormat.HTML]:
|
| 352 |
-
mime = FormatToMimeType[InputFormat.HTML][0]
|
| 353 |
-
elif ext in FormatToExtensions[InputFormat.MD]:
|
| 354 |
-
mime = FormatToMimeType[InputFormat.MD][0]
|
| 355 |
-
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
| 356 |
-
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
| 357 |
-
elif ext in FormatToExtensions[InputFormat.PDF]:
|
| 358 |
-
mime = FormatToMimeType[InputFormat.PDF][0]
|
| 359 |
-
return mime
|
| 360 |
-
|
| 361 |
-
@staticmethod
|
| 362 |
-
def _detect_html_xhtml(
|
| 363 |
-
content: bytes,
|
| 364 |
-
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
|
| 365 |
-
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
|
| 366 |
-
|
| 367 |
-
Args:
|
| 368 |
-
content: A short piece of a document from its beginning.
|
| 369 |
-
|
| 370 |
-
Returns:
|
| 371 |
-
The mime type of an XHTML, HTML, or XML file, or None if the content does
|
| 372 |
-
not match any of these formats.
|
| 373 |
-
"""
|
| 374 |
-
content_str = content.decode("ascii", errors="ignore").lower()
|
| 375 |
-
# Remove XML comments
|
| 376 |
-
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
| 377 |
-
content_str = content_str.lstrip()
|
| 378 |
-
|
| 379 |
-
if re.match(r"<\?xml", content_str):
|
| 380 |
-
if "xhtml" in content_str[:1000]:
|
| 381 |
-
return "application/xhtml+xml"
|
| 382 |
-
else:
|
| 383 |
-
return "application/xml"
|
| 384 |
-
|
| 385 |
-
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
| 386 |
-
return "text/html"
|
| 387 |
-
|
| 388 |
-
p = re.compile(
|
| 389 |
-
r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
|
| 390 |
-
)
|
| 391 |
-
if p.search(content_str):
|
| 392 |
-
return "application/xml"
|
| 393 |
-
|
| 394 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py
DELETED
|
@@ -1,296 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import os
|
| 3 |
-
from enum import Enum
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
| 6 |
-
|
| 7 |
-
from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
|
| 8 |
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 9 |
-
|
| 10 |
-
_log = logging.getLogger(__name__)
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class AcceleratorDevice(str, Enum):
|
| 14 |
-
"""Devices to run model inference"""
|
| 15 |
-
|
| 16 |
-
AUTO = "auto"
|
| 17 |
-
CPU = "cpu"
|
| 18 |
-
CUDA = "cuda"
|
| 19 |
-
MPS = "mps"
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class AcceleratorOptions(BaseSettings):
|
| 23 |
-
model_config = SettingsConfigDict(
|
| 24 |
-
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
| 25 |
-
)
|
| 26 |
-
|
| 27 |
-
num_threads: int = 4
|
| 28 |
-
device: AcceleratorDevice = AcceleratorDevice.AUTO
|
| 29 |
-
|
| 30 |
-
@model_validator(mode="before")
|
| 31 |
-
@classmethod
|
| 32 |
-
def check_alternative_envvars(cls, data: Any) -> Any:
|
| 33 |
-
r"""
|
| 34 |
-
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
| 35 |
-
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
| 36 |
-
|
| 37 |
-
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
| 38 |
-
the same functionality. In case the alias envvar is set and the user tries to override the
|
| 39 |
-
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
| 40 |
-
as an extra input instead of simply overwriting the evvar value for that parameter.
|
| 41 |
-
"""
|
| 42 |
-
if isinstance(data, dict):
|
| 43 |
-
input_num_threads = data.get("num_threads")
|
| 44 |
-
|
| 45 |
-
# Check if to set the num_threads from the alternative envvar
|
| 46 |
-
if input_num_threads is None:
|
| 47 |
-
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
| 48 |
-
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
| 49 |
-
if docling_num_threads is None and omp_num_threads is not None:
|
| 50 |
-
try:
|
| 51 |
-
data["num_threads"] = int(omp_num_threads)
|
| 52 |
-
except ValueError:
|
| 53 |
-
_log.error(
|
| 54 |
-
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
| 55 |
-
omp_num_threads,
|
| 56 |
-
)
|
| 57 |
-
return data
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
class TableFormerMode(str, Enum):
|
| 61 |
-
"""Modes for the TableFormer model."""
|
| 62 |
-
|
| 63 |
-
FAST = "fast"
|
| 64 |
-
ACCURATE = "accurate"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
class TableStructureOptions(BaseModel):
|
| 68 |
-
"""Options for the table structure."""
|
| 69 |
-
|
| 70 |
-
do_cell_matching: bool = (
|
| 71 |
-
True
|
| 72 |
-
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
| 73 |
-
# are merged across table columns.
|
| 74 |
-
# False: Let table structure model define the text cells, ignore PDF cells.
|
| 75 |
-
)
|
| 76 |
-
mode: TableFormerMode = TableFormerMode.FAST
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
class OcrOptions(BaseModel):
|
| 80 |
-
"""OCR options."""
|
| 81 |
-
|
| 82 |
-
kind: str
|
| 83 |
-
lang: List[str]
|
| 84 |
-
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
| 85 |
-
bitmap_area_threshold: float = (
|
| 86 |
-
0.05 # percentage of the area for a bitmap to processed with OCR
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
class RapidOcrOptions(OcrOptions):
|
| 91 |
-
"""Options for the RapidOCR engine."""
|
| 92 |
-
|
| 93 |
-
kind: Literal["rapidocr"] = "rapidocr"
|
| 94 |
-
|
| 95 |
-
# English and chinese are the most commly used models and have been tested with RapidOCR.
|
| 96 |
-
lang: List[str] = [
|
| 97 |
-
"english",
|
| 98 |
-
"chinese",
|
| 99 |
-
] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
|
| 100 |
-
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
|
| 101 |
-
|
| 102 |
-
# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
| 103 |
-
text_score: float = 0.5 # same default as rapidocr
|
| 104 |
-
|
| 105 |
-
use_det: Optional[bool] = None # same default as rapidocr
|
| 106 |
-
use_cls: Optional[bool] = None # same default as rapidocr
|
| 107 |
-
use_rec: Optional[bool] = None # same default as rapidocr
|
| 108 |
-
|
| 109 |
-
# class Device(Enum):
|
| 110 |
-
# CPU = "CPU"
|
| 111 |
-
# CUDA = "CUDA"
|
| 112 |
-
# DIRECTML = "DIRECTML"
|
| 113 |
-
# AUTO = "AUTO"
|
| 114 |
-
|
| 115 |
-
# device: Device = Device.AUTO # Default value is AUTO
|
| 116 |
-
|
| 117 |
-
print_verbose: bool = False # same default as rapidocr
|
| 118 |
-
|
| 119 |
-
det_model_path: Optional[str] = None # same default as rapidocr
|
| 120 |
-
cls_model_path: Optional[str] = None # same default as rapidocr
|
| 121 |
-
rec_model_path: Optional[str] = None # same default as rapidocr
|
| 122 |
-
rec_keys_path: Optional[str] = None # same default as rapidocr
|
| 123 |
-
|
| 124 |
-
model_config = ConfigDict(
|
| 125 |
-
extra="forbid",
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
class EasyOcrOptions(OcrOptions):
|
| 130 |
-
"""Options for the EasyOCR engine."""
|
| 131 |
-
|
| 132 |
-
kind: Literal["easyocr"] = "easyocr"
|
| 133 |
-
lang: List[str] = ["fr", "de", "es", "en"]
|
| 134 |
-
|
| 135 |
-
use_gpu: Optional[bool] = None
|
| 136 |
-
|
| 137 |
-
confidence_threshold: float = 0.5
|
| 138 |
-
|
| 139 |
-
model_storage_directory: Optional[str] = None
|
| 140 |
-
recog_network: Optional[str] = "standard"
|
| 141 |
-
download_enabled: bool = True
|
| 142 |
-
|
| 143 |
-
model_config = ConfigDict(
|
| 144 |
-
extra="forbid",
|
| 145 |
-
protected_namespaces=(),
|
| 146 |
-
)
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
class TesseractCliOcrOptions(OcrOptions):
|
| 150 |
-
"""Options for the TesseractCli engine."""
|
| 151 |
-
|
| 152 |
-
kind: Literal["tesseract"] = "tesseract"
|
| 153 |
-
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
| 154 |
-
tesseract_cmd: str = "tesseract"
|
| 155 |
-
path: Optional[str] = None
|
| 156 |
-
|
| 157 |
-
model_config = ConfigDict(
|
| 158 |
-
extra="forbid",
|
| 159 |
-
)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
class TesseractOcrOptions(OcrOptions):
|
| 163 |
-
"""Options for the Tesseract engine."""
|
| 164 |
-
|
| 165 |
-
kind: Literal["tesserocr"] = "tesserocr"
|
| 166 |
-
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
| 167 |
-
path: Optional[str] = None
|
| 168 |
-
|
| 169 |
-
model_config = ConfigDict(
|
| 170 |
-
extra="forbid",
|
| 171 |
-
)
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
class OcrMacOptions(OcrOptions):
|
| 175 |
-
"""Options for the Mac OCR engine."""
|
| 176 |
-
|
| 177 |
-
kind: Literal["ocrmac"] = "ocrmac"
|
| 178 |
-
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
| 179 |
-
recognition: str = "accurate"
|
| 180 |
-
framework: str = "vision"
|
| 181 |
-
|
| 182 |
-
model_config = ConfigDict(
|
| 183 |
-
extra="forbid",
|
| 184 |
-
)
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
class PictureDescriptionBaseOptions(BaseModel):
|
| 188 |
-
kind: str
|
| 189 |
-
batch_size: int = 8
|
| 190 |
-
scale: float = 2
|
| 191 |
-
|
| 192 |
-
bitmap_area_threshold: float = (
|
| 193 |
-
0.2 # percentage of the area for a bitmap to processed with the models
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
| 198 |
-
kind: Literal["api"] = "api"
|
| 199 |
-
|
| 200 |
-
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
| 201 |
-
headers: Dict[str, str] = {}
|
| 202 |
-
params: Dict[str, Any] = {}
|
| 203 |
-
timeout: float = 20
|
| 204 |
-
|
| 205 |
-
prompt: str = "Describe this image in a few sentences."
|
| 206 |
-
provenance: str = ""
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
| 210 |
-
kind: Literal["vlm"] = "vlm"
|
| 211 |
-
|
| 212 |
-
repo_id: str
|
| 213 |
-
prompt: str = "Describe this image in a few sentences."
|
| 214 |
-
# Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
|
| 215 |
-
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
|
| 216 |
-
|
| 217 |
-
@property
|
| 218 |
-
def repo_cache_folder(self) -> str:
|
| 219 |
-
return self.repo_id.replace("/", "--")
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
| 223 |
-
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
| 224 |
-
)
|
| 225 |
-
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
|
| 226 |
-
granite_picture_description = PictureDescriptionVlmOptions(
|
| 227 |
-
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
| 228 |
-
prompt="What is shown in this image?",
|
| 229 |
-
)
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
# Define an enum for the backend options
|
| 233 |
-
class PdfBackend(str, Enum):
|
| 234 |
-
"""Enum of valid PDF backends."""
|
| 235 |
-
|
| 236 |
-
PYPDFIUM2 = "pypdfium2"
|
| 237 |
-
DLPARSE_V1 = "dlparse_v1"
|
| 238 |
-
DLPARSE_V2 = "dlparse_v2"
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
# Define an enum for the ocr engines
|
| 242 |
-
class OcrEngine(str, Enum):
|
| 243 |
-
"""Enum of valid OCR engines."""
|
| 244 |
-
|
| 245 |
-
EASYOCR = "easyocr"
|
| 246 |
-
TESSERACT_CLI = "tesseract_cli"
|
| 247 |
-
TESSERACT = "tesseract"
|
| 248 |
-
OCRMAC = "ocrmac"
|
| 249 |
-
RAPIDOCR = "rapidocr"
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
class PipelineOptions(BaseModel):
|
| 253 |
-
"""Base pipeline options."""
|
| 254 |
-
|
| 255 |
-
create_legacy_output: bool = (
|
| 256 |
-
True # This default will be set to False on a future version of docling
|
| 257 |
-
)
|
| 258 |
-
document_timeout: Optional[float] = None
|
| 259 |
-
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
class PdfPipelineOptions(PipelineOptions):
|
| 263 |
-
"""Options for the PDF pipeline."""
|
| 264 |
-
|
| 265 |
-
artifacts_path: Optional[Union[Path, str]] = None
|
| 266 |
-
do_table_structure: bool = True # True: perform table structure extraction
|
| 267 |
-
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
| 268 |
-
do_code_enrichment: bool = False # True: perform code OCR
|
| 269 |
-
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
| 270 |
-
do_picture_classification: bool = False # True: classify pictures in documents
|
| 271 |
-
do_picture_description: bool = False # True: run describe pictures in documents
|
| 272 |
-
|
| 273 |
-
table_structure_options: TableStructureOptions = TableStructureOptions()
|
| 274 |
-
ocr_options: Union[
|
| 275 |
-
EasyOcrOptions,
|
| 276 |
-
TesseractCliOcrOptions,
|
| 277 |
-
TesseractOcrOptions,
|
| 278 |
-
OcrMacOptions,
|
| 279 |
-
RapidOcrOptions,
|
| 280 |
-
] = Field(EasyOcrOptions(), discriminator="kind")
|
| 281 |
-
picture_description_options: Annotated[
|
| 282 |
-
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
| 283 |
-
Field(discriminator="kind"),
|
| 284 |
-
] = smolvlm_picture_description
|
| 285 |
-
|
| 286 |
-
images_scale: float = 1.0
|
| 287 |
-
generate_page_images: bool = False
|
| 288 |
-
generate_picture_images: bool = False
|
| 289 |
-
generate_table_images: bool = Field(
|
| 290 |
-
default=False,
|
| 291 |
-
deprecated=(
|
| 292 |
-
"Field `generate_table_images` is deprecated. "
|
| 293 |
-
"To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
|
| 294 |
-
"before conversion and then use the `TableItem.get_image` function."
|
| 295 |
-
),
|
| 296 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import Annotated, Tuple
|
| 4 |
-
|
| 5 |
-
from pydantic import BaseModel, PlainValidator
|
| 6 |
-
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
| 10 |
-
if v[0] < 1 or v[1] < v[0]:
|
| 11 |
-
raise ValueError(
|
| 12 |
-
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
| 13 |
-
)
|
| 14 |
-
return v
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
| 18 |
-
|
| 19 |
-
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class DocumentLimits(BaseModel):
|
| 23 |
-
max_num_pages: int = sys.maxsize
|
| 24 |
-
max_file_size: int = sys.maxsize
|
| 25 |
-
page_range: PageRange = DEFAULT_PAGE_RANGE
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
class BatchConcurrencySettings(BaseModel):
|
| 29 |
-
doc_batch_size: int = 2
|
| 30 |
-
doc_batch_concurrency: int = 2
|
| 31 |
-
page_batch_size: int = 4
|
| 32 |
-
page_batch_concurrency: int = 2
|
| 33 |
-
elements_batch_size: int = 16
|
| 34 |
-
|
| 35 |
-
# doc_batch_size: int = 1
|
| 36 |
-
# doc_batch_concurrency: int = 1
|
| 37 |
-
# page_batch_size: int = 1
|
| 38 |
-
# page_batch_concurrency: int = 1
|
| 39 |
-
|
| 40 |
-
# model_concurrency: int = 2
|
| 41 |
-
|
| 42 |
-
# To force models into single core: export OMP_NUM_THREADS=1
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
class DebugSettings(BaseModel):
|
| 46 |
-
visualize_cells: bool = False
|
| 47 |
-
visualize_ocr: bool = False
|
| 48 |
-
visualize_layout: bool = False
|
| 49 |
-
visualize_raw_layout: bool = False
|
| 50 |
-
visualize_tables: bool = False
|
| 51 |
-
|
| 52 |
-
profile_pipeline_timings: bool = False
|
| 53 |
-
|
| 54 |
-
# Path used to output debug information.
|
| 55 |
-
debug_output_path: str = str(Path.cwd() / "debug")
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
class AppSettings(BaseSettings):
|
| 59 |
-
model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
|
| 60 |
-
|
| 61 |
-
perf: BatchConcurrencySettings
|
| 62 |
-
debug: DebugSettings
|
| 63 |
-
|
| 64 |
-
cache_dir: Path = Path.home() / ".cache" / "docling"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py
DELETED
|
@@ -1,348 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import math
|
| 3 |
-
import sys
|
| 4 |
-
import time
|
| 5 |
-
from functools import partial
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
| 8 |
-
|
| 9 |
-
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
| 10 |
-
|
| 11 |
-
from docling.backend.abstract_backend import AbstractDocumentBackend
|
| 12 |
-
from docling.backend.asciidoc_backend import AsciiDocBackend
|
| 13 |
-
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
| 14 |
-
from docling.backend.html_backend import HTMLDocumentBackend
|
| 15 |
-
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
| 16 |
-
from docling.backend.md_backend import MarkdownDocumentBackend
|
| 17 |
-
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
| 18 |
-
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
| 19 |
-
from docling.backend.msword_backend import MsWordDocumentBackend
|
| 20 |
-
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
| 21 |
-
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
| 22 |
-
from docling.datamodel.base_models import (
|
| 23 |
-
ConversionStatus,
|
| 24 |
-
DoclingComponentType,
|
| 25 |
-
DocumentStream,
|
| 26 |
-
ErrorItem,
|
| 27 |
-
InputFormat,
|
| 28 |
-
)
|
| 29 |
-
from docling.datamodel.document import (
|
| 30 |
-
ConversionResult,
|
| 31 |
-
InputDocument,
|
| 32 |
-
_DocumentConversionInput,
|
| 33 |
-
)
|
| 34 |
-
from docling.datamodel.pipeline_options import PipelineOptions
|
| 35 |
-
from docling.datamodel.settings import (
|
| 36 |
-
DEFAULT_PAGE_RANGE,
|
| 37 |
-
DocumentLimits,
|
| 38 |
-
PageRange,
|
| 39 |
-
settings,
|
| 40 |
-
)
|
| 41 |
-
from docling.exceptions import ConversionError
|
| 42 |
-
from docling.pipeline.base_pipeline import BasePipeline
|
| 43 |
-
from docling.pipeline.simple_pipeline import SimplePipeline
|
| 44 |
-
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
| 45 |
-
from docling.utils.utils import chunkify
|
| 46 |
-
|
| 47 |
-
_log = logging.getLogger(__name__)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
class FormatOption(BaseModel):
|
| 51 |
-
pipeline_cls: Type[BasePipeline]
|
| 52 |
-
pipeline_options: Optional[PipelineOptions] = None
|
| 53 |
-
backend: Type[AbstractDocumentBackend]
|
| 54 |
-
|
| 55 |
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 56 |
-
|
| 57 |
-
@model_validator(mode="after")
|
| 58 |
-
def set_optional_field_default(self) -> "FormatOption":
|
| 59 |
-
if self.pipeline_options is None:
|
| 60 |
-
self.pipeline_options = self.pipeline_cls.get_default_options()
|
| 61 |
-
return self
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
class ExcelFormatOption(FormatOption):
|
| 65 |
-
pipeline_cls: Type = SimplePipeline
|
| 66 |
-
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
class WordFormatOption(FormatOption):
|
| 70 |
-
pipeline_cls: Type = SimplePipeline
|
| 71 |
-
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
class PowerpointFormatOption(FormatOption):
|
| 75 |
-
pipeline_cls: Type = SimplePipeline
|
| 76 |
-
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
class MarkdownFormatOption(FormatOption):
|
| 80 |
-
pipeline_cls: Type = SimplePipeline
|
| 81 |
-
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
class AsciiDocFormatOption(FormatOption):
|
| 85 |
-
pipeline_cls: Type = SimplePipeline
|
| 86 |
-
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
class HTMLFormatOption(FormatOption):
|
| 90 |
-
pipeline_cls: Type = SimplePipeline
|
| 91 |
-
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
class PatentUsptoFormatOption(FormatOption):
|
| 95 |
-
pipeline_cls: Type = SimplePipeline
|
| 96 |
-
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
class XMLPubMedFormatOption(FormatOption):
|
| 100 |
-
pipeline_cls: Type = SimplePipeline
|
| 101 |
-
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
class ImageFormatOption(FormatOption):
|
| 105 |
-
pipeline_cls: Type = StandardPdfPipeline
|
| 106 |
-
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
class PdfFormatOption(FormatOption):
|
| 110 |
-
pipeline_cls: Type = StandardPdfPipeline
|
| 111 |
-
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def _get_default_option(format: InputFormat) -> FormatOption:
|
| 115 |
-
format_to_default_options = {
|
| 116 |
-
InputFormat.XLSX: FormatOption(
|
| 117 |
-
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
| 118 |
-
),
|
| 119 |
-
InputFormat.DOCX: FormatOption(
|
| 120 |
-
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
| 121 |
-
),
|
| 122 |
-
InputFormat.PPTX: FormatOption(
|
| 123 |
-
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
| 124 |
-
),
|
| 125 |
-
InputFormat.MD: FormatOption(
|
| 126 |
-
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
| 127 |
-
),
|
| 128 |
-
InputFormat.ASCIIDOC: FormatOption(
|
| 129 |
-
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
| 130 |
-
),
|
| 131 |
-
InputFormat.HTML: FormatOption(
|
| 132 |
-
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
| 133 |
-
),
|
| 134 |
-
InputFormat.XML_USPTO: FormatOption(
|
| 135 |
-
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
| 136 |
-
),
|
| 137 |
-
InputFormat.XML_PUBMED: FormatOption(
|
| 138 |
-
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
| 139 |
-
),
|
| 140 |
-
InputFormat.IMAGE: FormatOption(
|
| 141 |
-
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
| 142 |
-
),
|
| 143 |
-
InputFormat.PDF: FormatOption(
|
| 144 |
-
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
| 145 |
-
),
|
| 146 |
-
InputFormat.JSON_DOCLING: FormatOption(
|
| 147 |
-
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
| 148 |
-
),
|
| 149 |
-
}
|
| 150 |
-
if (options := format_to_default_options.get(format)) is not None:
|
| 151 |
-
return options
|
| 152 |
-
else:
|
| 153 |
-
raise RuntimeError(f"No default options configured for {format}")
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
class DocumentConverter:
|
| 157 |
-
_default_download_filename = "file"
|
| 158 |
-
|
| 159 |
-
def __init__(
|
| 160 |
-
self,
|
| 161 |
-
allowed_formats: Optional[List[InputFormat]] = None,
|
| 162 |
-
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
| 163 |
-
):
|
| 164 |
-
self.allowed_formats = (
|
| 165 |
-
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
| 166 |
-
)
|
| 167 |
-
self.format_to_options = {
|
| 168 |
-
format: (
|
| 169 |
-
_get_default_option(format=format)
|
| 170 |
-
if (custom_option := (format_options or {}).get(format)) is None
|
| 171 |
-
else custom_option
|
| 172 |
-
)
|
| 173 |
-
for format in self.allowed_formats
|
| 174 |
-
}
|
| 175 |
-
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
| 176 |
-
|
| 177 |
-
def initialize_pipeline(self, format: InputFormat):
|
| 178 |
-
"""Initialize the conversion pipeline for the selected format."""
|
| 179 |
-
pipeline = self._get_pipeline(doc_format=format)
|
| 180 |
-
if pipeline is None:
|
| 181 |
-
raise ConversionError(
|
| 182 |
-
f"No pipeline could be initialized for format {format}"
|
| 183 |
-
)
|
| 184 |
-
|
| 185 |
-
@validate_call(config=ConfigDict(strict=True))
|
| 186 |
-
def convert(
|
| 187 |
-
self,
|
| 188 |
-
source: Union[Path, str, DocumentStream], # TODO review naming
|
| 189 |
-
headers: Optional[Dict[str, str]] = None,
|
| 190 |
-
raises_on_error: bool = True,
|
| 191 |
-
max_num_pages: int = sys.maxsize,
|
| 192 |
-
max_file_size: int = sys.maxsize,
|
| 193 |
-
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
| 194 |
-
) -> ConversionResult:
|
| 195 |
-
all_res = self.convert_all(
|
| 196 |
-
source=[source],
|
| 197 |
-
raises_on_error=raises_on_error,
|
| 198 |
-
max_num_pages=max_num_pages,
|
| 199 |
-
max_file_size=max_file_size,
|
| 200 |
-
headers=headers,
|
| 201 |
-
page_range=page_range,
|
| 202 |
-
)
|
| 203 |
-
return next(all_res)
|
| 204 |
-
|
| 205 |
-
@validate_call(config=ConfigDict(strict=True))
|
| 206 |
-
def convert_all(
|
| 207 |
-
self,
|
| 208 |
-
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
| 209 |
-
headers: Optional[Dict[str, str]] = None,
|
| 210 |
-
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
| 211 |
-
max_num_pages: int = sys.maxsize,
|
| 212 |
-
max_file_size: int = sys.maxsize,
|
| 213 |
-
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
| 214 |
-
) -> Iterator[ConversionResult]:
|
| 215 |
-
limits = DocumentLimits(
|
| 216 |
-
max_num_pages=max_num_pages,
|
| 217 |
-
max_file_size=max_file_size,
|
| 218 |
-
page_range=page_range,
|
| 219 |
-
)
|
| 220 |
-
conv_input = _DocumentConversionInput(
|
| 221 |
-
path_or_stream_iterator=source, limits=limits, headers=headers
|
| 222 |
-
)
|
| 223 |
-
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
| 224 |
-
|
| 225 |
-
had_result = False
|
| 226 |
-
for conv_res in conv_res_iter:
|
| 227 |
-
had_result = True
|
| 228 |
-
if raises_on_error and conv_res.status not in {
|
| 229 |
-
ConversionStatus.SUCCESS,
|
| 230 |
-
ConversionStatus.PARTIAL_SUCCESS,
|
| 231 |
-
}:
|
| 232 |
-
raise ConversionError(
|
| 233 |
-
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
| 234 |
-
)
|
| 235 |
-
else:
|
| 236 |
-
yield conv_res
|
| 237 |
-
|
| 238 |
-
if not had_result and raises_on_error:
|
| 239 |
-
raise ConversionError(
|
| 240 |
-
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
def _convert(
|
| 244 |
-
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
| 245 |
-
) -> Iterator[ConversionResult]:
|
| 246 |
-
start_time = time.monotonic()
|
| 247 |
-
|
| 248 |
-
for input_batch in chunkify(
|
| 249 |
-
conv_input.docs(self.format_to_options),
|
| 250 |
-
settings.perf.doc_batch_size, # pass format_options
|
| 251 |
-
):
|
| 252 |
-
_log.info(f"Going to convert document batch...")
|
| 253 |
-
|
| 254 |
-
# parallel processing only within input_batch
|
| 255 |
-
# with ThreadPoolExecutor(
|
| 256 |
-
# max_workers=settings.perf.doc_batch_concurrency
|
| 257 |
-
# ) as pool:
|
| 258 |
-
# yield from pool.map(self.process_document, input_batch)
|
| 259 |
-
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
| 260 |
-
|
| 261 |
-
for item in map(
|
| 262 |
-
partial(self._process_document, raises_on_error=raises_on_error),
|
| 263 |
-
input_batch,
|
| 264 |
-
):
|
| 265 |
-
elapsed = time.monotonic() - start_time
|
| 266 |
-
start_time = time.monotonic()
|
| 267 |
-
_log.info(
|
| 268 |
-
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
| 269 |
-
)
|
| 270 |
-
yield item
|
| 271 |
-
|
| 272 |
-
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
| 273 |
-
fopt = self.format_to_options.get(doc_format)
|
| 274 |
-
|
| 275 |
-
if fopt is None:
|
| 276 |
-
return None
|
| 277 |
-
else:
|
| 278 |
-
pipeline_class = fopt.pipeline_cls
|
| 279 |
-
pipeline_options = fopt.pipeline_options
|
| 280 |
-
|
| 281 |
-
if pipeline_options is None:
|
| 282 |
-
return None
|
| 283 |
-
# TODO this will ignore if different options have been defined for the same pipeline class.
|
| 284 |
-
if (
|
| 285 |
-
pipeline_class not in self.initialized_pipelines
|
| 286 |
-
or self.initialized_pipelines[pipeline_class].pipeline_options
|
| 287 |
-
!= pipeline_options
|
| 288 |
-
):
|
| 289 |
-
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
| 290 |
-
pipeline_options=pipeline_options
|
| 291 |
-
)
|
| 292 |
-
return self.initialized_pipelines[pipeline_class]
|
| 293 |
-
|
| 294 |
-
def _process_document(
|
| 295 |
-
self, in_doc: InputDocument, raises_on_error: bool
|
| 296 |
-
) -> ConversionResult:
|
| 297 |
-
|
| 298 |
-
valid = (
|
| 299 |
-
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
| 300 |
-
)
|
| 301 |
-
if valid:
|
| 302 |
-
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
| 303 |
-
else:
|
| 304 |
-
error_message = f"File format not allowed: {in_doc.file}"
|
| 305 |
-
if raises_on_error:
|
| 306 |
-
raise ConversionError(error_message)
|
| 307 |
-
else:
|
| 308 |
-
error_item = ErrorItem(
|
| 309 |
-
component_type=DoclingComponentType.USER_INPUT,
|
| 310 |
-
module_name="",
|
| 311 |
-
error_message=error_message,
|
| 312 |
-
)
|
| 313 |
-
conv_res = ConversionResult(
|
| 314 |
-
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
| 315 |
-
)
|
| 316 |
-
|
| 317 |
-
return conv_res
|
| 318 |
-
|
| 319 |
-
def _execute_pipeline(
|
| 320 |
-
self, in_doc: InputDocument, raises_on_error: bool
|
| 321 |
-
) -> ConversionResult:
|
| 322 |
-
if in_doc.valid:
|
| 323 |
-
pipeline = self._get_pipeline(in_doc.format)
|
| 324 |
-
if pipeline is not None:
|
| 325 |
-
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
| 326 |
-
else:
|
| 327 |
-
if raises_on_error:
|
| 328 |
-
raise ConversionError(
|
| 329 |
-
f"No pipeline could be initialized for {in_doc.file}."
|
| 330 |
-
)
|
| 331 |
-
else:
|
| 332 |
-
conv_res = ConversionResult(
|
| 333 |
-
input=in_doc,
|
| 334 |
-
status=ConversionStatus.FAILURE,
|
| 335 |
-
)
|
| 336 |
-
else:
|
| 337 |
-
if raises_on_error:
|
| 338 |
-
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
| 339 |
-
|
| 340 |
-
else:
|
| 341 |
-
# invalid doc or not of desired format
|
| 342 |
-
conv_res = ConversionResult(
|
| 343 |
-
input=in_doc,
|
| 344 |
-
status=ConversionStatus.FAILURE,
|
| 345 |
-
)
|
| 346 |
-
# TODO add error log why it failed.
|
| 347 |
-
|
| 348 |
-
return conv_res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
class BaseError(RuntimeError):
|
| 2 |
-
pass
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class ConversionError(BaseError):
|
| 6 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py
DELETED
|
File without changes
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
| 1 |
-
from abc import ABC, abstractmethod
|
| 2 |
-
from typing import Any, Generic, Iterable, Optional
|
| 3 |
-
|
| 4 |
-
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
| 5 |
-
from typing_extensions import TypeVar
|
| 6 |
-
|
| 7 |
-
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
| 8 |
-
from docling.datamodel.document import ConversionResult
|
| 9 |
-
from docling.datamodel.settings import settings
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
class BasePageModel(ABC):
|
| 13 |
-
@abstractmethod
|
| 14 |
-
def __call__(
|
| 15 |
-
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
| 16 |
-
) -> Iterable[Page]:
|
| 17 |
-
pass
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
| 24 |
-
|
| 25 |
-
elements_batch_size: int = settings.perf.elements_batch_size
|
| 26 |
-
|
| 27 |
-
@abstractmethod
|
| 28 |
-
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
| 29 |
-
pass
|
| 30 |
-
|
| 31 |
-
@abstractmethod
|
| 32 |
-
def prepare_element(
|
| 33 |
-
self, conv_res: ConversionResult, element: NodeItem
|
| 34 |
-
) -> Optional[EnrichElementT]:
|
| 35 |
-
pass
|
| 36 |
-
|
| 37 |
-
@abstractmethod
|
| 38 |
-
def __call__(
|
| 39 |
-
self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
|
| 40 |
-
) -> Iterable[NodeItem]:
|
| 41 |
-
pass
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
|
| 45 |
-
|
| 46 |
-
def prepare_element(
|
| 47 |
-
self, conv_res: ConversionResult, element: NodeItem
|
| 48 |
-
) -> Optional[NodeItem]:
|
| 49 |
-
if self.is_processable(doc=conv_res.document, element=element):
|
| 50 |
-
return element
|
| 51 |
-
return None
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class BaseItemAndImageEnrichmentModel(
|
| 55 |
-
GenericEnrichmentModel[ItemAndImageEnrichmentElement]
|
| 56 |
-
):
|
| 57 |
-
|
| 58 |
-
images_scale: float
|
| 59 |
-
expansion_factor: float = 0.0
|
| 60 |
-
|
| 61 |
-
def prepare_element(
|
| 62 |
-
self, conv_res: ConversionResult, element: NodeItem
|
| 63 |
-
) -> Optional[ItemAndImageEnrichmentElement]:
|
| 64 |
-
if not self.is_processable(doc=conv_res.document, element=element):
|
| 65 |
-
return None
|
| 66 |
-
|
| 67 |
-
assert isinstance(element, DocItem)
|
| 68 |
-
element_prov = element.prov[0]
|
| 69 |
-
|
| 70 |
-
bbox = element_prov.bbox
|
| 71 |
-
width = bbox.r - bbox.l
|
| 72 |
-
height = bbox.t - bbox.b
|
| 73 |
-
|
| 74 |
-
# TODO: move to a utility in the BoundingBox class
|
| 75 |
-
expanded_bbox = BoundingBox(
|
| 76 |
-
l=bbox.l - width * self.expansion_factor,
|
| 77 |
-
t=bbox.t + height * self.expansion_factor,
|
| 78 |
-
r=bbox.r + width * self.expansion_factor,
|
| 79 |
-
b=bbox.b - height * self.expansion_factor,
|
| 80 |
-
coord_origin=bbox.coord_origin,
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
page_ix = element_prov.page_no - 1
|
| 84 |
-
cropped_image = conv_res.pages[page_ix].get_image(
|
| 85 |
-
scale=self.images_scale, cropbox=expanded_bbox
|
| 86 |
-
)
|
| 87 |
-
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py
DELETED
|
@@ -1,189 +0,0 @@
|
|
| 1 |
-
import copy
|
| 2 |
-
import logging
|
| 3 |
-
from abc import abstractmethod
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Iterable, List
|
| 6 |
-
|
| 7 |
-
import numpy as np
|
| 8 |
-
from docling_core.types.doc import BoundingBox, CoordOrigin
|
| 9 |
-
from PIL import Image, ImageDraw
|
| 10 |
-
from rtree import index
|
| 11 |
-
from scipy.ndimage import binary_dilation, find_objects, label
|
| 12 |
-
|
| 13 |
-
from docling.datamodel.base_models import Cell, OcrCell, Page
|
| 14 |
-
from docling.datamodel.document import ConversionResult
|
| 15 |
-
from docling.datamodel.pipeline_options import OcrOptions
|
| 16 |
-
from docling.datamodel.settings import settings
|
| 17 |
-
from docling.models.base_model import BasePageModel
|
| 18 |
-
|
| 19 |
-
_log = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class BaseOcrModel(BasePageModel):
|
| 23 |
-
def __init__(self, enabled: bool, options: OcrOptions):
|
| 24 |
-
self.enabled = enabled
|
| 25 |
-
self.options = options
|
| 26 |
-
|
| 27 |
-
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
| 28 |
-
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
| 29 |
-
BITMAP_COVERAGE_TRESHOLD = 0.75
|
| 30 |
-
assert page.size is not None
|
| 31 |
-
|
| 32 |
-
def find_ocr_rects(size, bitmap_rects):
|
| 33 |
-
image = Image.new(
|
| 34 |
-
"1", (round(size.width), round(size.height))
|
| 35 |
-
) # '1' mode is binary
|
| 36 |
-
|
| 37 |
-
# Draw all bitmap rects into a binary image
|
| 38 |
-
draw = ImageDraw.Draw(image)
|
| 39 |
-
for rect in bitmap_rects:
|
| 40 |
-
x0, y0, x1, y1 = rect.as_tuple()
|
| 41 |
-
x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
|
| 42 |
-
draw.rectangle([(x0, y0), (x1, y1)], fill=1)
|
| 43 |
-
|
| 44 |
-
np_image = np.array(image)
|
| 45 |
-
|
| 46 |
-
# Dilate the image by 10 pixels to merge nearby bitmap rectangles
|
| 47 |
-
structure = np.ones(
|
| 48 |
-
(20, 20)
|
| 49 |
-
) # Create a 20x20 structure element (10 pixels in all directions)
|
| 50 |
-
np_image = binary_dilation(np_image > 0, structure=structure)
|
| 51 |
-
|
| 52 |
-
# Find the connected components
|
| 53 |
-
labeled_image, num_features = label(
|
| 54 |
-
np_image > 0
|
| 55 |
-
) # Label black (0 value) regions
|
| 56 |
-
|
| 57 |
-
# Find enclosing bounding boxes for each connected component.
|
| 58 |
-
slices = find_objects(labeled_image)
|
| 59 |
-
bounding_boxes = [
|
| 60 |
-
BoundingBox(
|
| 61 |
-
l=slc[1].start,
|
| 62 |
-
t=slc[0].start,
|
| 63 |
-
r=slc[1].stop - 1,
|
| 64 |
-
b=slc[0].stop - 1,
|
| 65 |
-
coord_origin=CoordOrigin.TOPLEFT,
|
| 66 |
-
)
|
| 67 |
-
for slc in slices
|
| 68 |
-
]
|
| 69 |
-
|
| 70 |
-
# Compute area fraction on page covered by bitmaps
|
| 71 |
-
area_frac = np.sum(np_image > 0) / (size.width * size.height)
|
| 72 |
-
|
| 73 |
-
return (area_frac, bounding_boxes) # fraction covered # boxes
|
| 74 |
-
|
| 75 |
-
if page._backend is not None:
|
| 76 |
-
bitmap_rects = page._backend.get_bitmap_rects()
|
| 77 |
-
else:
|
| 78 |
-
bitmap_rects = []
|
| 79 |
-
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
| 80 |
-
|
| 81 |
-
# return full-page rectangle if page is dominantly covered with bitmaps
|
| 82 |
-
if self.options.force_full_page_ocr or coverage > max(
|
| 83 |
-
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
| 84 |
-
):
|
| 85 |
-
return [
|
| 86 |
-
BoundingBox(
|
| 87 |
-
l=0,
|
| 88 |
-
t=0,
|
| 89 |
-
r=page.size.width,
|
| 90 |
-
b=page.size.height,
|
| 91 |
-
coord_origin=CoordOrigin.TOPLEFT,
|
| 92 |
-
)
|
| 93 |
-
]
|
| 94 |
-
# return individual rectangles if the bitmap coverage is above the threshold
|
| 95 |
-
elif coverage > self.options.bitmap_area_threshold:
|
| 96 |
-
return ocr_rects
|
| 97 |
-
else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
|
| 98 |
-
return []
|
| 99 |
-
|
| 100 |
-
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
| 101 |
-
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
| 102 |
-
# Create R-tree index for programmatic cells
|
| 103 |
-
p = index.Property()
|
| 104 |
-
p.dimension = 2
|
| 105 |
-
idx = index.Index(properties=p)
|
| 106 |
-
for i, cell in enumerate(programmatic_cells):
|
| 107 |
-
idx.insert(i, cell.bbox.as_tuple())
|
| 108 |
-
|
| 109 |
-
def is_overlapping_with_existing_cells(ocr_cell):
|
| 110 |
-
# Query the R-tree to get overlapping rectangles
|
| 111 |
-
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
|
| 112 |
-
|
| 113 |
-
return (
|
| 114 |
-
len(possible_matches_index) > 0
|
| 115 |
-
) # this is a weak criterion but it works.
|
| 116 |
-
|
| 117 |
-
filtered_ocr_cells = [
|
| 118 |
-
rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
|
| 119 |
-
]
|
| 120 |
-
return filtered_ocr_cells
|
| 121 |
-
|
| 122 |
-
def post_process_cells(self, ocr_cells, programmatic_cells):
|
| 123 |
-
r"""
|
| 124 |
-
Post-process the ocr and programmatic cells and return the final list of of cells
|
| 125 |
-
"""
|
| 126 |
-
if self.options.force_full_page_ocr:
|
| 127 |
-
# If a full page OCR is forced, use only the OCR cells
|
| 128 |
-
cells = [
|
| 129 |
-
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
| 130 |
-
for c_ocr in ocr_cells
|
| 131 |
-
]
|
| 132 |
-
return cells
|
| 133 |
-
|
| 134 |
-
## Remove OCR cells which overlap with programmatic cells.
|
| 135 |
-
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
| 136 |
-
programmatic_cells.extend(filtered_ocr_cells)
|
| 137 |
-
return programmatic_cells
|
| 138 |
-
|
| 139 |
-
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
| 140 |
-
image = copy.deepcopy(page.image)
|
| 141 |
-
scale_x = image.width / page.size.width
|
| 142 |
-
scale_y = image.height / page.size.height
|
| 143 |
-
|
| 144 |
-
draw = ImageDraw.Draw(image, "RGBA")
|
| 145 |
-
|
| 146 |
-
# Draw OCR rectangles as yellow filled rect
|
| 147 |
-
for rect in ocr_rects:
|
| 148 |
-
x0, y0, x1, y1 = rect.as_tuple()
|
| 149 |
-
y0 *= scale_x
|
| 150 |
-
y1 *= scale_y
|
| 151 |
-
x0 *= scale_x
|
| 152 |
-
x1 *= scale_x
|
| 153 |
-
|
| 154 |
-
shade_color = (255, 255, 0, 40) # transparent yellow
|
| 155 |
-
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
| 156 |
-
|
| 157 |
-
# Draw OCR and programmatic cells
|
| 158 |
-
for tc in page.cells:
|
| 159 |
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
| 160 |
-
y0 *= scale_x
|
| 161 |
-
y1 *= scale_y
|
| 162 |
-
x0 *= scale_x
|
| 163 |
-
x1 *= scale_x
|
| 164 |
-
|
| 165 |
-
if y1 <= y0:
|
| 166 |
-
y1, y0 = y0, y1
|
| 167 |
-
|
| 168 |
-
color = "gray"
|
| 169 |
-
if isinstance(tc, OcrCell):
|
| 170 |
-
color = "magenta"
|
| 171 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
| 172 |
-
|
| 173 |
-
if show:
|
| 174 |
-
image.show()
|
| 175 |
-
else:
|
| 176 |
-
out_path: Path = (
|
| 177 |
-
Path(settings.debug.debug_output_path)
|
| 178 |
-
/ f"debug_{conv_res.input.file.stem}"
|
| 179 |
-
)
|
| 180 |
-
out_path.mkdir(parents=True, exist_ok=True)
|
| 181 |
-
|
| 182 |
-
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
|
| 183 |
-
image.save(str(out_file), format="png")
|
| 184 |
-
|
| 185 |
-
@abstractmethod
|
| 186 |
-
def __call__(
|
| 187 |
-
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
| 188 |
-
) -> Iterable[Page]:
|
| 189 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py
DELETED
|
@@ -1,251 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
| 4 |
-
|
| 5 |
-
import numpy as np
|
| 6 |
-
from docling_core.types.doc import (
|
| 7 |
-
CodeItem,
|
| 8 |
-
DocItemLabel,
|
| 9 |
-
DoclingDocument,
|
| 10 |
-
NodeItem,
|
| 11 |
-
TextItem,
|
| 12 |
-
)
|
| 13 |
-
from docling_core.types.doc.labels import CodeLanguageLabel
|
| 14 |
-
from PIL import Image
|
| 15 |
-
from pydantic import BaseModel
|
| 16 |
-
|
| 17 |
-
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
| 18 |
-
from docling.datamodel.pipeline_options import AcceleratorOptions
|
| 19 |
-
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
| 20 |
-
from docling.utils.accelerator_utils import decide_device
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class CodeFormulaModelOptions(BaseModel):
|
| 24 |
-
"""
|
| 25 |
-
Configuration options for the CodeFormulaModel.
|
| 26 |
-
|
| 27 |
-
Attributes
|
| 28 |
-
----------
|
| 29 |
-
kind : str
|
| 30 |
-
Type of the model. Fixed value "code_formula".
|
| 31 |
-
do_code_enrichment : bool
|
| 32 |
-
True if code enrichment is enabled, False otherwise.
|
| 33 |
-
do_formula_enrichment : bool
|
| 34 |
-
True if formula enrichment is enabled, False otherwise.
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
-
kind: Literal["code_formula"] = "code_formula"
|
| 38 |
-
do_code_enrichment: bool = True
|
| 39 |
-
do_formula_enrichment: bool = True
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
| 43 |
-
"""
|
| 44 |
-
Model for processing and enriching documents with code and formula predictions.
|
| 45 |
-
|
| 46 |
-
Attributes
|
| 47 |
-
----------
|
| 48 |
-
enabled : bool
|
| 49 |
-
True if the model is enabled, False otherwise.
|
| 50 |
-
options : CodeFormulaModelOptions
|
| 51 |
-
Configuration options for the CodeFormulaModel.
|
| 52 |
-
code_formula_model : CodeFormulaPredictor
|
| 53 |
-
The predictor model for code and formula processing.
|
| 54 |
-
|
| 55 |
-
Methods
|
| 56 |
-
-------
|
| 57 |
-
__init__(self, enabled, artifacts_path, accelerator_options, code_formula_options)
|
| 58 |
-
Initializes the CodeFormulaModel with the given configuration options.
|
| 59 |
-
is_processable(self, doc, element)
|
| 60 |
-
Determines if a given element in a document can be processed by the model.
|
| 61 |
-
__call__(self, doc, element_batch)
|
| 62 |
-
Processes the given batch of elements and enriches them with predictions.
|
| 63 |
-
"""
|
| 64 |
-
|
| 65 |
-
_model_repo_folder = "ds4sd--CodeFormula"
|
| 66 |
-
elements_batch_size = 5
|
| 67 |
-
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
| 68 |
-
expansion_factor = 0.03
|
| 69 |
-
|
| 70 |
-
def __init__(
|
| 71 |
-
self,
|
| 72 |
-
enabled: bool,
|
| 73 |
-
artifacts_path: Optional[Path],
|
| 74 |
-
options: CodeFormulaModelOptions,
|
| 75 |
-
accelerator_options: AcceleratorOptions,
|
| 76 |
-
):
|
| 77 |
-
"""
|
| 78 |
-
Initializes the CodeFormulaModel with the given configuration.
|
| 79 |
-
|
| 80 |
-
Parameters
|
| 81 |
-
----------
|
| 82 |
-
enabled : bool
|
| 83 |
-
True if the model is enabled, False otherwise.
|
| 84 |
-
artifacts_path : Path
|
| 85 |
-
Path to the directory containing the model artifacts.
|
| 86 |
-
options : CodeFormulaModelOptions
|
| 87 |
-
Configuration options for the model.
|
| 88 |
-
accelerator_options : AcceleratorOptions
|
| 89 |
-
Options specifying the device and number of threads for acceleration.
|
| 90 |
-
"""
|
| 91 |
-
self.enabled = enabled
|
| 92 |
-
self.options = options
|
| 93 |
-
|
| 94 |
-
if self.enabled:
|
| 95 |
-
device = decide_device(accelerator_options.device)
|
| 96 |
-
|
| 97 |
-
from docling_ibm_models.code_formula_model.code_formula_predictor import (
|
| 98 |
-
CodeFormulaPredictor,
|
| 99 |
-
)
|
| 100 |
-
|
| 101 |
-
if artifacts_path is None:
|
| 102 |
-
artifacts_path = self.download_models()
|
| 103 |
-
else:
|
| 104 |
-
artifacts_path = artifacts_path / self._model_repo_folder
|
| 105 |
-
|
| 106 |
-
self.code_formula_model = CodeFormulaPredictor(
|
| 107 |
-
artifacts_path=str(artifacts_path),
|
| 108 |
-
device=device,
|
| 109 |
-
num_threads=accelerator_options.num_threads,
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
-
@staticmethod
|
| 113 |
-
def download_models(
|
| 114 |
-
local_dir: Optional[Path] = None,
|
| 115 |
-
force: bool = False,
|
| 116 |
-
progress: bool = False,
|
| 117 |
-
) -> Path:
|
| 118 |
-
from huggingface_hub import snapshot_download
|
| 119 |
-
from huggingface_hub.utils import disable_progress_bars
|
| 120 |
-
|
| 121 |
-
if not progress:
|
| 122 |
-
disable_progress_bars()
|
| 123 |
-
download_path = snapshot_download(
|
| 124 |
-
repo_id="ds4sd/CodeFormula",
|
| 125 |
-
force_download=force,
|
| 126 |
-
local_dir=local_dir,
|
| 127 |
-
revision="v1.0.1",
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
-
return Path(download_path)
|
| 131 |
-
|
| 132 |
-
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
| 133 |
-
"""
|
| 134 |
-
Determines if a given element in a document can be processed by the model.
|
| 135 |
-
|
| 136 |
-
Parameters
|
| 137 |
-
----------
|
| 138 |
-
doc : DoclingDocument
|
| 139 |
-
The document being processed.
|
| 140 |
-
element : NodeItem
|
| 141 |
-
The element within the document to check.
|
| 142 |
-
|
| 143 |
-
Returns
|
| 144 |
-
-------
|
| 145 |
-
bool
|
| 146 |
-
True if the element can be processed, False otherwise.
|
| 147 |
-
"""
|
| 148 |
-
return self.enabled and (
|
| 149 |
-
(isinstance(element, CodeItem) and self.options.do_code_enrichment)
|
| 150 |
-
or (
|
| 151 |
-
isinstance(element, TextItem)
|
| 152 |
-
and element.label == DocItemLabel.FORMULA
|
| 153 |
-
and self.options.do_formula_enrichment
|
| 154 |
-
)
|
| 155 |
-
)
|
| 156 |
-
|
| 157 |
-
def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
|
| 158 |
-
"""Extracts a programming language from the beginning of a string.
|
| 159 |
-
|
| 160 |
-
This function checks if the input string starts with a pattern of the form
|
| 161 |
-
``<_some_language_>``. If it does, it extracts the language string and returns
|
| 162 |
-
a tuple of (remainder, language). Otherwise, it returns the original string
|
| 163 |
-
and `None`.
|
| 164 |
-
|
| 165 |
-
Args:
|
| 166 |
-
input_string (str): The input string, which may start with ``<_language_>``.
|
| 167 |
-
|
| 168 |
-
Returns:
|
| 169 |
-
Tuple[str, Optional[str]]:
|
| 170 |
-
A tuple where:
|
| 171 |
-
- The first element is either:
|
| 172 |
-
- The remainder of the string (everything after ``<_language_>``),
|
| 173 |
-
if a match is found; or
|
| 174 |
-
- The original string, if no match is found.
|
| 175 |
-
- The second element is the extracted language if a match is found;
|
| 176 |
-
otherwise, `None`.
|
| 177 |
-
"""
|
| 178 |
-
pattern = r"^<_([^>]+)_>\s*(.*)"
|
| 179 |
-
match = re.match(pattern, input_string, flags=re.DOTALL)
|
| 180 |
-
if match:
|
| 181 |
-
language = str(match.group(1)) # the captured programming language
|
| 182 |
-
remainder = str(match.group(2)) # everything after the <_language_>
|
| 183 |
-
return remainder, language
|
| 184 |
-
else:
|
| 185 |
-
return input_string, None
|
| 186 |
-
|
| 187 |
-
def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
|
| 188 |
-
"""
|
| 189 |
-
Converts a string to a corresponding `CodeLanguageLabel` enum member.
|
| 190 |
-
|
| 191 |
-
If the provided string does not match any value in `CodeLanguageLabel`,
|
| 192 |
-
it defaults to `CodeLanguageLabel.UNKNOWN`.
|
| 193 |
-
|
| 194 |
-
Args:
|
| 195 |
-
value (Optional[str]): The string representation of the code language or None.
|
| 196 |
-
|
| 197 |
-
Returns:
|
| 198 |
-
CodeLanguageLabel: The corresponding enum member if the value is valid,
|
| 199 |
-
otherwise `CodeLanguageLabel.UNKNOWN`.
|
| 200 |
-
"""
|
| 201 |
-
if not isinstance(value, str):
|
| 202 |
-
return CodeLanguageLabel.UNKNOWN
|
| 203 |
-
|
| 204 |
-
try:
|
| 205 |
-
return CodeLanguageLabel(value)
|
| 206 |
-
except ValueError:
|
| 207 |
-
return CodeLanguageLabel.UNKNOWN
|
| 208 |
-
|
| 209 |
-
def __call__(
|
| 210 |
-
self,
|
| 211 |
-
doc: DoclingDocument,
|
| 212 |
-
element_batch: Iterable[ItemAndImageEnrichmentElement],
|
| 213 |
-
) -> Iterable[NodeItem]:
|
| 214 |
-
"""
|
| 215 |
-
Processes the given batch of elements and enriches them with predictions.
|
| 216 |
-
|
| 217 |
-
Parameters
|
| 218 |
-
----------
|
| 219 |
-
doc : DoclingDocument
|
| 220 |
-
The document being processed.
|
| 221 |
-
element_batch : Iterable[ItemAndImageEnrichmentElement]
|
| 222 |
-
A batch of elements to be processed.
|
| 223 |
-
|
| 224 |
-
Returns
|
| 225 |
-
-------
|
| 226 |
-
Iterable[Any]
|
| 227 |
-
An iterable of enriched elements.
|
| 228 |
-
"""
|
| 229 |
-
if not self.enabled:
|
| 230 |
-
for element in element_batch:
|
| 231 |
-
yield element.item
|
| 232 |
-
return
|
| 233 |
-
|
| 234 |
-
labels: List[str] = []
|
| 235 |
-
images: List[Union[Image.Image, np.ndarray]] = []
|
| 236 |
-
elements: List[TextItem] = []
|
| 237 |
-
for el in element_batch:
|
| 238 |
-
assert isinstance(el.item, TextItem)
|
| 239 |
-
elements.append(el.item)
|
| 240 |
-
labels.append(el.item.label)
|
| 241 |
-
images.append(el.image)
|
| 242 |
-
|
| 243 |
-
outputs = self.code_formula_model.predict(images, labels)
|
| 244 |
-
|
| 245 |
-
for item, output in zip(elements, outputs):
|
| 246 |
-
if isinstance(item, CodeItem):
|
| 247 |
-
output, code_language = self._extract_code_language(output)
|
| 248 |
-
item.code_language = self._get_code_language_enum(code_language)
|
| 249 |
-
item.text = output
|
| 250 |
-
|
| 251 |
-
yield item
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py
DELETED
|
@@ -1,190 +0,0 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
-
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
| 3 |
-
|
| 4 |
-
import numpy as np
|
| 5 |
-
from docling_core.types.doc import (
|
| 6 |
-
DoclingDocument,
|
| 7 |
-
NodeItem,
|
| 8 |
-
PictureClassificationClass,
|
| 9 |
-
PictureClassificationData,
|
| 10 |
-
PictureItem,
|
| 11 |
-
)
|
| 12 |
-
from PIL import Image
|
| 13 |
-
from pydantic import BaseModel
|
| 14 |
-
|
| 15 |
-
from docling.datamodel.pipeline_options import AcceleratorOptions
|
| 16 |
-
from docling.models.base_model import BaseEnrichmentModel
|
| 17 |
-
from docling.utils.accelerator_utils import decide_device
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class DocumentPictureClassifierOptions(BaseModel):
|
| 21 |
-
"""
|
| 22 |
-
Options for configuring the DocumentPictureClassifier.
|
| 23 |
-
|
| 24 |
-
Attributes
|
| 25 |
-
----------
|
| 26 |
-
kind : Literal["document_picture_classifier"]
|
| 27 |
-
Identifier for the type of classifier.
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class DocumentPictureClassifier(BaseEnrichmentModel):
|
| 34 |
-
"""
|
| 35 |
-
A model for classifying pictures in documents.
|
| 36 |
-
|
| 37 |
-
This class enriches document pictures with predicted classifications
|
| 38 |
-
based on a predefined set of classes.
|
| 39 |
-
|
| 40 |
-
Attributes
|
| 41 |
-
----------
|
| 42 |
-
enabled : bool
|
| 43 |
-
Whether the classifier is enabled for use.
|
| 44 |
-
options : DocumentPictureClassifierOptions
|
| 45 |
-
Configuration options for the classifier.
|
| 46 |
-
document_picture_classifier : DocumentPictureClassifierPredictor
|
| 47 |
-
The underlying prediction model, loaded if the classifier is enabled.
|
| 48 |
-
|
| 49 |
-
Methods
|
| 50 |
-
-------
|
| 51 |
-
__init__(enabled, artifacts_path, options, accelerator_options)
|
| 52 |
-
Initializes the classifier with specified configurations.
|
| 53 |
-
is_processable(doc, element)
|
| 54 |
-
Checks if the given element can be processed by the classifier.
|
| 55 |
-
__call__(doc, element_batch)
|
| 56 |
-
Processes a batch of elements and adds classification annotations.
|
| 57 |
-
"""
|
| 58 |
-
|
| 59 |
-
_model_repo_folder = "ds4sd--DocumentFigureClassifier"
|
| 60 |
-
images_scale = 2
|
| 61 |
-
|
| 62 |
-
def __init__(
|
| 63 |
-
self,
|
| 64 |
-
enabled: bool,
|
| 65 |
-
artifacts_path: Optional[Path],
|
| 66 |
-
options: DocumentPictureClassifierOptions,
|
| 67 |
-
accelerator_options: AcceleratorOptions,
|
| 68 |
-
):
|
| 69 |
-
"""
|
| 70 |
-
Initializes the DocumentPictureClassifier.
|
| 71 |
-
|
| 72 |
-
Parameters
|
| 73 |
-
----------
|
| 74 |
-
enabled : bool
|
| 75 |
-
Indicates whether the classifier is enabled.
|
| 76 |
-
artifacts_path : Optional[Union[Path, str]],
|
| 77 |
-
Path to the directory containing model artifacts.
|
| 78 |
-
options : DocumentPictureClassifierOptions
|
| 79 |
-
Configuration options for the classifier.
|
| 80 |
-
accelerator_options : AcceleratorOptions
|
| 81 |
-
Options for configuring the device and parallelism.
|
| 82 |
-
"""
|
| 83 |
-
self.enabled = enabled
|
| 84 |
-
self.options = options
|
| 85 |
-
|
| 86 |
-
if self.enabled:
|
| 87 |
-
device = decide_device(accelerator_options.device)
|
| 88 |
-
from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
|
| 89 |
-
DocumentFigureClassifierPredictor,
|
| 90 |
-
)
|
| 91 |
-
|
| 92 |
-
if artifacts_path is None:
|
| 93 |
-
artifacts_path = self.download_models()
|
| 94 |
-
else:
|
| 95 |
-
artifacts_path = artifacts_path / self._model_repo_folder
|
| 96 |
-
|
| 97 |
-
self.document_picture_classifier = DocumentFigureClassifierPredictor(
|
| 98 |
-
artifacts_path=str(artifacts_path),
|
| 99 |
-
device=device,
|
| 100 |
-
num_threads=accelerator_options.num_threads,
|
| 101 |
-
)
|
| 102 |
-
|
| 103 |
-
@staticmethod
|
| 104 |
-
def download_models(
|
| 105 |
-
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
| 106 |
-
) -> Path:
|
| 107 |
-
from huggingface_hub import snapshot_download
|
| 108 |
-
from huggingface_hub.utils import disable_progress_bars
|
| 109 |
-
|
| 110 |
-
if not progress:
|
| 111 |
-
disable_progress_bars()
|
| 112 |
-
download_path = snapshot_download(
|
| 113 |
-
repo_id="ds4sd/DocumentFigureClassifier",
|
| 114 |
-
force_download=force,
|
| 115 |
-
local_dir=local_dir,
|
| 116 |
-
revision="v1.0.0",
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
return Path(download_path)
|
| 120 |
-
|
| 121 |
-
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
| 122 |
-
"""
|
| 123 |
-
Determines if the given element can be processed by the classifier.
|
| 124 |
-
|
| 125 |
-
Parameters
|
| 126 |
-
----------
|
| 127 |
-
doc : DoclingDocument
|
| 128 |
-
The document containing the element.
|
| 129 |
-
element : NodeItem
|
| 130 |
-
The element to be checked.
|
| 131 |
-
|
| 132 |
-
Returns
|
| 133 |
-
-------
|
| 134 |
-
bool
|
| 135 |
-
True if the element is a PictureItem and processing is enabled; False otherwise.
|
| 136 |
-
"""
|
| 137 |
-
return self.enabled and isinstance(element, PictureItem)
|
| 138 |
-
|
| 139 |
-
def __call__(
|
| 140 |
-
self,
|
| 141 |
-
doc: DoclingDocument,
|
| 142 |
-
element_batch: Iterable[NodeItem],
|
| 143 |
-
) -> Iterable[NodeItem]:
|
| 144 |
-
"""
|
| 145 |
-
Processes a batch of elements and enriches them with classification predictions.
|
| 146 |
-
|
| 147 |
-
Parameters
|
| 148 |
-
----------
|
| 149 |
-
doc : DoclingDocument
|
| 150 |
-
The document containing the elements to be processed.
|
| 151 |
-
element_batch : Iterable[NodeItem]
|
| 152 |
-
A batch of pictures to classify.
|
| 153 |
-
|
| 154 |
-
Returns
|
| 155 |
-
-------
|
| 156 |
-
Iterable[NodeItem]
|
| 157 |
-
An iterable of NodeItem objects after processing. The field
|
| 158 |
-
'data.classification' is added containing the classification for each picture.
|
| 159 |
-
"""
|
| 160 |
-
if not self.enabled:
|
| 161 |
-
for element in element_batch:
|
| 162 |
-
yield element
|
| 163 |
-
return
|
| 164 |
-
|
| 165 |
-
images: List[Union[Image.Image, np.ndarray]] = []
|
| 166 |
-
elements: List[PictureItem] = []
|
| 167 |
-
for el in element_batch:
|
| 168 |
-
assert isinstance(el, PictureItem)
|
| 169 |
-
elements.append(el)
|
| 170 |
-
img = el.get_image(doc)
|
| 171 |
-
assert img is not None
|
| 172 |
-
images.append(img)
|
| 173 |
-
|
| 174 |
-
outputs = self.document_picture_classifier.predict(images)
|
| 175 |
-
|
| 176 |
-
for element, output in zip(elements, outputs):
|
| 177 |
-
element.annotations.append(
|
| 178 |
-
PictureClassificationData(
|
| 179 |
-
provenance="DocumentPictureClassifier",
|
| 180 |
-
predicted_classes=[
|
| 181 |
-
PictureClassificationClass(
|
| 182 |
-
class_name=pred[0],
|
| 183 |
-
confidence=pred[1],
|
| 184 |
-
)
|
| 185 |
-
for pred in output
|
| 186 |
-
],
|
| 187 |
-
)
|
| 188 |
-
)
|
| 189 |
-
|
| 190 |
-
yield element
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py
DELETED
|
@@ -1,386 +0,0 @@
|
|
| 1 |
-
import copy
|
| 2 |
-
import random
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import List, Union
|
| 5 |
-
|
| 6 |
-
from deepsearch_glm.andromeda_nlp import nlp_model
|
| 7 |
-
from docling_core.types.doc import (
|
| 8 |
-
BoundingBox,
|
| 9 |
-
CoordOrigin,
|
| 10 |
-
DocItemLabel,
|
| 11 |
-
DoclingDocument,
|
| 12 |
-
)
|
| 13 |
-
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
| 14 |
-
from docling_core.types.legacy_doc.base import (
|
| 15 |
-
Figure,
|
| 16 |
-
PageDimensions,
|
| 17 |
-
PageReference,
|
| 18 |
-
Prov,
|
| 19 |
-
Ref,
|
| 20 |
-
)
|
| 21 |
-
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
| 22 |
-
from docling_core.types.legacy_doc.base import TableCell
|
| 23 |
-
from docling_core.types.legacy_doc.document import BaseText
|
| 24 |
-
from docling_core.types.legacy_doc.document import (
|
| 25 |
-
CCSDocumentDescription as DsDocumentDescription,
|
| 26 |
-
)
|
| 27 |
-
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
| 28 |
-
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
| 29 |
-
from PIL import ImageDraw
|
| 30 |
-
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
| 31 |
-
|
| 32 |
-
from docling.datamodel.base_models import (
|
| 33 |
-
Cluster,
|
| 34 |
-
ContainerElement,
|
| 35 |
-
FigureElement,
|
| 36 |
-
Table,
|
| 37 |
-
TextElement,
|
| 38 |
-
)
|
| 39 |
-
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
| 40 |
-
from docling.datamodel.settings import settings
|
| 41 |
-
from docling.utils.glm_utils import to_docling_document
|
| 42 |
-
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
| 43 |
-
from docling.utils.utils import create_hash
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
class GlmOptions(BaseModel):
|
| 47 |
-
model_config = ConfigDict(protected_namespaces=())
|
| 48 |
-
|
| 49 |
-
model_names: str = "" # e.g. "language;term;reference"
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
class GlmModel:
|
| 53 |
-
def __init__(self, options: GlmOptions):
|
| 54 |
-
self.options = options
|
| 55 |
-
|
| 56 |
-
self.model = nlp_model(loglevel="error", text_ordering=True)
|
| 57 |
-
|
| 58 |
-
def _to_legacy_document(self, conv_res) -> DsDocument:
|
| 59 |
-
title = ""
|
| 60 |
-
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
|
| 61 |
-
|
| 62 |
-
page_hashes = [
|
| 63 |
-
PageReference(
|
| 64 |
-
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
|
| 65 |
-
page=p.page_no + 1,
|
| 66 |
-
model="default",
|
| 67 |
-
)
|
| 68 |
-
for p in conv_res.pages
|
| 69 |
-
]
|
| 70 |
-
|
| 71 |
-
file_info = DsFileInfoObject(
|
| 72 |
-
filename=conv_res.input.file.name,
|
| 73 |
-
document_hash=conv_res.input.document_hash,
|
| 74 |
-
num_pages=conv_res.input.page_count,
|
| 75 |
-
page_hashes=page_hashes,
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
main_text: List[Union[Ref, BaseText]] = []
|
| 79 |
-
page_headers: List[Union[Ref, BaseText]] = []
|
| 80 |
-
page_footers: List[Union[Ref, BaseText]] = []
|
| 81 |
-
|
| 82 |
-
tables: List[DsSchemaTable] = []
|
| 83 |
-
figures: List[Figure] = []
|
| 84 |
-
|
| 85 |
-
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
| 86 |
-
|
| 87 |
-
for element in conv_res.assembled.body:
|
| 88 |
-
# Convert bboxes to lower-left origin.
|
| 89 |
-
target_bbox = DsBoundingBox(
|
| 90 |
-
element.cluster.bbox.to_bottom_left_origin(
|
| 91 |
-
page_no_to_page[element.page_no].size.height
|
| 92 |
-
).as_tuple()
|
| 93 |
-
)
|
| 94 |
-
|
| 95 |
-
if isinstance(element, TextElement):
|
| 96 |
-
main_text.append(
|
| 97 |
-
BaseText(
|
| 98 |
-
text=element.text,
|
| 99 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 100 |
-
name=element.label,
|
| 101 |
-
prov=[
|
| 102 |
-
Prov(
|
| 103 |
-
bbox=target_bbox,
|
| 104 |
-
page=element.page_no + 1,
|
| 105 |
-
span=[0, len(element.text)],
|
| 106 |
-
)
|
| 107 |
-
],
|
| 108 |
-
)
|
| 109 |
-
)
|
| 110 |
-
elif isinstance(element, Table):
|
| 111 |
-
index = len(tables)
|
| 112 |
-
ref_str = f"#/tables/{index}"
|
| 113 |
-
main_text.append(
|
| 114 |
-
Ref(
|
| 115 |
-
name=element.label,
|
| 116 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 117 |
-
ref=ref_str,
|
| 118 |
-
),
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
# Initialise empty table data grid (only empty cells)
|
| 122 |
-
table_data = [
|
| 123 |
-
[
|
| 124 |
-
TableCell(
|
| 125 |
-
text="",
|
| 126 |
-
# bbox=[0,0,0,0],
|
| 127 |
-
spans=[[i, j]],
|
| 128 |
-
obj_type="body",
|
| 129 |
-
)
|
| 130 |
-
for j in range(element.num_cols)
|
| 131 |
-
]
|
| 132 |
-
for i in range(element.num_rows)
|
| 133 |
-
]
|
| 134 |
-
|
| 135 |
-
# Overwrite cells in table data for which there is actual cell content.
|
| 136 |
-
for cell in element.table_cells:
|
| 137 |
-
for i in range(
|
| 138 |
-
min(cell.start_row_offset_idx, element.num_rows),
|
| 139 |
-
min(cell.end_row_offset_idx, element.num_rows),
|
| 140 |
-
):
|
| 141 |
-
for j in range(
|
| 142 |
-
min(cell.start_col_offset_idx, element.num_cols),
|
| 143 |
-
min(cell.end_col_offset_idx, element.num_cols),
|
| 144 |
-
):
|
| 145 |
-
celltype = "body"
|
| 146 |
-
if cell.column_header:
|
| 147 |
-
celltype = "col_header"
|
| 148 |
-
elif cell.row_header:
|
| 149 |
-
celltype = "row_header"
|
| 150 |
-
elif cell.row_section:
|
| 151 |
-
celltype = "row_section"
|
| 152 |
-
|
| 153 |
-
def make_spans(cell):
|
| 154 |
-
for rspan in range(
|
| 155 |
-
min(cell.start_row_offset_idx, element.num_rows),
|
| 156 |
-
min(cell.end_row_offset_idx, element.num_rows),
|
| 157 |
-
):
|
| 158 |
-
for cspan in range(
|
| 159 |
-
min(
|
| 160 |
-
cell.start_col_offset_idx, element.num_cols
|
| 161 |
-
),
|
| 162 |
-
min(cell.end_col_offset_idx, element.num_cols),
|
| 163 |
-
):
|
| 164 |
-
yield [rspan, cspan]
|
| 165 |
-
|
| 166 |
-
spans = list(make_spans(cell))
|
| 167 |
-
if cell.bbox is not None:
|
| 168 |
-
bbox = cell.bbox.to_bottom_left_origin(
|
| 169 |
-
page_no_to_page[element.page_no].size.height
|
| 170 |
-
).as_tuple()
|
| 171 |
-
else:
|
| 172 |
-
bbox = None
|
| 173 |
-
|
| 174 |
-
table_data[i][j] = TableCell(
|
| 175 |
-
text=cell.text,
|
| 176 |
-
bbox=bbox,
|
| 177 |
-
# col=j,
|
| 178 |
-
# row=i,
|
| 179 |
-
spans=spans,
|
| 180 |
-
obj_type=celltype,
|
| 181 |
-
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
| 182 |
-
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
| 183 |
-
)
|
| 184 |
-
|
| 185 |
-
tables.append(
|
| 186 |
-
DsSchemaTable(
|
| 187 |
-
num_cols=element.num_cols,
|
| 188 |
-
num_rows=element.num_rows,
|
| 189 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 190 |
-
data=table_data,
|
| 191 |
-
prov=[
|
| 192 |
-
Prov(
|
| 193 |
-
bbox=target_bbox,
|
| 194 |
-
page=element.page_no + 1,
|
| 195 |
-
span=[0, 0],
|
| 196 |
-
)
|
| 197 |
-
],
|
| 198 |
-
)
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
elif isinstance(element, FigureElement):
|
| 202 |
-
index = len(figures)
|
| 203 |
-
ref_str = f"#/figures/{index}"
|
| 204 |
-
main_text.append(
|
| 205 |
-
Ref(
|
| 206 |
-
name=element.label,
|
| 207 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 208 |
-
ref=ref_str,
|
| 209 |
-
),
|
| 210 |
-
)
|
| 211 |
-
figures.append(
|
| 212 |
-
Figure(
|
| 213 |
-
prov=[
|
| 214 |
-
Prov(
|
| 215 |
-
bbox=target_bbox,
|
| 216 |
-
page=element.page_no + 1,
|
| 217 |
-
span=[0, 0],
|
| 218 |
-
)
|
| 219 |
-
],
|
| 220 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 221 |
-
payload={
|
| 222 |
-
"children": TypeAdapter(List[Cluster]).dump_python(
|
| 223 |
-
element.cluster.children
|
| 224 |
-
)
|
| 225 |
-
}, # hack to channel child clusters through GLM
|
| 226 |
-
)
|
| 227 |
-
)
|
| 228 |
-
elif isinstance(element, ContainerElement):
|
| 229 |
-
main_text.append(
|
| 230 |
-
BaseText(
|
| 231 |
-
text="",
|
| 232 |
-
payload={
|
| 233 |
-
"children": TypeAdapter(List[Cluster]).dump_python(
|
| 234 |
-
element.cluster.children
|
| 235 |
-
)
|
| 236 |
-
}, # hack to channel child clusters through GLM
|
| 237 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 238 |
-
name=element.label,
|
| 239 |
-
prov=[
|
| 240 |
-
Prov(
|
| 241 |
-
bbox=target_bbox,
|
| 242 |
-
page=element.page_no + 1,
|
| 243 |
-
span=[0, 0],
|
| 244 |
-
)
|
| 245 |
-
],
|
| 246 |
-
)
|
| 247 |
-
)
|
| 248 |
-
|
| 249 |
-
# We can throw in headers and footers at the end of the legacy doc
|
| 250 |
-
# since the reading-order will re-sort it later.
|
| 251 |
-
for element in conv_res.assembled.headers:
|
| 252 |
-
# Convert bboxes to lower-left origin.
|
| 253 |
-
target_bbox = DsBoundingBox(
|
| 254 |
-
element.cluster.bbox.to_bottom_left_origin(
|
| 255 |
-
page_no_to_page[element.page_no].size.height
|
| 256 |
-
).as_tuple()
|
| 257 |
-
)
|
| 258 |
-
|
| 259 |
-
if isinstance(element, TextElement):
|
| 260 |
-
|
| 261 |
-
tel = BaseText(
|
| 262 |
-
text=element.text,
|
| 263 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 264 |
-
name=element.label,
|
| 265 |
-
prov=[
|
| 266 |
-
Prov(
|
| 267 |
-
bbox=target_bbox,
|
| 268 |
-
page=element.page_no + 1,
|
| 269 |
-
span=[0, len(element.text)],
|
| 270 |
-
)
|
| 271 |
-
],
|
| 272 |
-
)
|
| 273 |
-
if element.label == DocItemLabel.PAGE_HEADER:
|
| 274 |
-
index = len(page_headers)
|
| 275 |
-
ref_str = f"#/page-headers/{index}"
|
| 276 |
-
main_text.append(
|
| 277 |
-
Ref(
|
| 278 |
-
name=element.label,
|
| 279 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 280 |
-
ref=ref_str,
|
| 281 |
-
),
|
| 282 |
-
)
|
| 283 |
-
page_headers.append(tel)
|
| 284 |
-
elif element.label == DocItemLabel.PAGE_FOOTER:
|
| 285 |
-
index = len(page_footers)
|
| 286 |
-
ref_str = f"#/page-footers/{index}"
|
| 287 |
-
main_text.append(
|
| 288 |
-
Ref(
|
| 289 |
-
name=element.label,
|
| 290 |
-
obj_type=layout_label_to_ds_type.get(element.label),
|
| 291 |
-
ref=ref_str,
|
| 292 |
-
),
|
| 293 |
-
)
|
| 294 |
-
page_footers.append(tel)
|
| 295 |
-
|
| 296 |
-
page_dimensions = [
|
| 297 |
-
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
| 298 |
-
for p in conv_res.pages
|
| 299 |
-
if p.size is not None
|
| 300 |
-
]
|
| 301 |
-
|
| 302 |
-
ds_doc: DsDocument = DsDocument(
|
| 303 |
-
name=title,
|
| 304 |
-
description=desc,
|
| 305 |
-
file_info=file_info,
|
| 306 |
-
main_text=main_text,
|
| 307 |
-
tables=tables,
|
| 308 |
-
figures=figures,
|
| 309 |
-
page_dimensions=page_dimensions,
|
| 310 |
-
page_headers=page_headers,
|
| 311 |
-
page_footers=page_footers,
|
| 312 |
-
)
|
| 313 |
-
|
| 314 |
-
return ds_doc
|
| 315 |
-
|
| 316 |
-
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
| 317 |
-
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
| 318 |
-
ds_doc = self._to_legacy_document(conv_res)
|
| 319 |
-
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
|
| 320 |
-
|
| 321 |
-
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
| 322 |
-
|
| 323 |
-
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
| 324 |
-
1 == 1
|
| 325 |
-
|
| 326 |
-
# DEBUG code:
|
| 327 |
-
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
| 328 |
-
clusters_to_draw = []
|
| 329 |
-
image = copy.deepcopy(conv_res.pages[page_no].image)
|
| 330 |
-
for ix, elem in enumerate(ds_document.main_text):
|
| 331 |
-
if isinstance(elem, BaseText):
|
| 332 |
-
prov = elem.prov[0] # type: ignore
|
| 333 |
-
elif isinstance(elem, Ref):
|
| 334 |
-
_, arr, index = elem.ref.split("/")
|
| 335 |
-
index = int(index) # type: ignore
|
| 336 |
-
if arr == "tables":
|
| 337 |
-
prov = ds_document.tables[index].prov[0]
|
| 338 |
-
elif arr == "figures":
|
| 339 |
-
prov = ds_document.pictures[index].prov[0]
|
| 340 |
-
else:
|
| 341 |
-
prov = None
|
| 342 |
-
|
| 343 |
-
if prov and prov.page == page_no:
|
| 344 |
-
clusters_to_draw.append(
|
| 345 |
-
Cluster(
|
| 346 |
-
id=ix,
|
| 347 |
-
label=elem.name,
|
| 348 |
-
bbox=BoundingBox.from_tuple(
|
| 349 |
-
coord=prov.bbox, # type: ignore
|
| 350 |
-
origin=CoordOrigin.BOTTOMLEFT,
|
| 351 |
-
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
| 352 |
-
)
|
| 353 |
-
)
|
| 354 |
-
|
| 355 |
-
draw = ImageDraw.Draw(image)
|
| 356 |
-
for c in clusters_to_draw:
|
| 357 |
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
| 358 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
| 359 |
-
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
|
| 360 |
-
|
| 361 |
-
cell_color = (
|
| 362 |
-
random.randint(30, 140),
|
| 363 |
-
random.randint(30, 140),
|
| 364 |
-
random.randint(30, 140),
|
| 365 |
-
)
|
| 366 |
-
for tc in c.cells: # [:1]:
|
| 367 |
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
| 368 |
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
| 369 |
-
|
| 370 |
-
if show:
|
| 371 |
-
image.show()
|
| 372 |
-
else:
|
| 373 |
-
out_path: Path = (
|
| 374 |
-
Path(settings.debug.debug_output_path)
|
| 375 |
-
/ f"debug_{conv_res.input.file.stem}"
|
| 376 |
-
)
|
| 377 |
-
out_path.mkdir(parents=True, exist_ok=True)
|
| 378 |
-
|
| 379 |
-
out_file = out_path / f"doc_page_{page_no:05}.png"
|
| 380 |
-
image.save(str(out_file), format="png")
|
| 381 |
-
|
| 382 |
-
# for item in ds_doc.page_dimensions:
|
| 383 |
-
# page_no = item.page
|
| 384 |
-
# draw_clusters_and_cells(ds_doc, page_no)
|
| 385 |
-
|
| 386 |
-
return docling_doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py
DELETED
|
@@ -1,177 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import warnings
|
| 3 |
-
import zipfile
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Iterable, List, Optional
|
| 6 |
-
|
| 7 |
-
import numpy
|
| 8 |
-
from docling_core.types.doc import BoundingBox, CoordOrigin
|
| 9 |
-
|
| 10 |
-
from docling.datamodel.base_models import Cell, OcrCell, Page
|
| 11 |
-
from docling.datamodel.document import ConversionResult
|
| 12 |
-
from docling.datamodel.pipeline_options import (
|
| 13 |
-
AcceleratorDevice,
|
| 14 |
-
AcceleratorOptions,
|
| 15 |
-
EasyOcrOptions,
|
| 16 |
-
)
|
| 17 |
-
from docling.datamodel.settings import settings
|
| 18 |
-
from docling.models.base_ocr_model import BaseOcrModel
|
| 19 |
-
from docling.utils.accelerator_utils import decide_device
|
| 20 |
-
from docling.utils.profiling import TimeRecorder
|
| 21 |
-
from docling.utils.utils import download_url_with_progress
|
| 22 |
-
|
| 23 |
-
_log = logging.getLogger(__name__)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
class EasyOcrModel(BaseOcrModel):
|
| 27 |
-
_model_repo_folder = "EasyOcr"
|
| 28 |
-
|
| 29 |
-
def __init__(
|
| 30 |
-
self,
|
| 31 |
-
enabled: bool,
|
| 32 |
-
artifacts_path: Optional[Path],
|
| 33 |
-
options: EasyOcrOptions,
|
| 34 |
-
accelerator_options: AcceleratorOptions,
|
| 35 |
-
):
|
| 36 |
-
super().__init__(enabled=enabled, options=options)
|
| 37 |
-
self.options: EasyOcrOptions
|
| 38 |
-
|
| 39 |
-
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
| 40 |
-
|
| 41 |
-
if self.enabled:
|
| 42 |
-
try:
|
| 43 |
-
import easyocr
|
| 44 |
-
except ImportError:
|
| 45 |
-
raise ImportError(
|
| 46 |
-
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
| 47 |
-
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
if self.options.use_gpu is None:
|
| 51 |
-
device = decide_device(accelerator_options.device)
|
| 52 |
-
# Enable easyocr GPU if running on CUDA, MPS
|
| 53 |
-
use_gpu = any(
|
| 54 |
-
[
|
| 55 |
-
device.startswith(x)
|
| 56 |
-
for x in [
|
| 57 |
-
AcceleratorDevice.CUDA.value,
|
| 58 |
-
AcceleratorDevice.MPS.value,
|
| 59 |
-
]
|
| 60 |
-
]
|
| 61 |
-
)
|
| 62 |
-
else:
|
| 63 |
-
warnings.warn(
|
| 64 |
-
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
| 65 |
-
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
| 66 |
-
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
| 67 |
-
)
|
| 68 |
-
use_gpu = self.options.use_gpu
|
| 69 |
-
|
| 70 |
-
download_enabled = self.options.download_enabled
|
| 71 |
-
model_storage_directory = self.options.model_storage_directory
|
| 72 |
-
if artifacts_path is not None and model_storage_directory is None:
|
| 73 |
-
download_enabled = False
|
| 74 |
-
model_storage_directory = str(artifacts_path / self._model_repo_folder)
|
| 75 |
-
|
| 76 |
-
self.reader = easyocr.Reader(
|
| 77 |
-
lang_list=self.options.lang,
|
| 78 |
-
gpu=use_gpu,
|
| 79 |
-
model_storage_directory=model_storage_directory,
|
| 80 |
-
recog_network=self.options.recog_network,
|
| 81 |
-
download_enabled=download_enabled,
|
| 82 |
-
verbose=False,
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
@staticmethod
|
| 86 |
-
def download_models(
|
| 87 |
-
detection_models: List[str] = ["craft"],
|
| 88 |
-
recognition_models: List[str] = ["english_g2", "latin_g2"],
|
| 89 |
-
local_dir: Optional[Path] = None,
|
| 90 |
-
force: bool = False,
|
| 91 |
-
progress: bool = False,
|
| 92 |
-
) -> Path:
|
| 93 |
-
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
| 94 |
-
from easyocr.config import detection_models as det_models_dict
|
| 95 |
-
from easyocr.config import recognition_models as rec_models_dict
|
| 96 |
-
|
| 97 |
-
if local_dir is None:
|
| 98 |
-
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
| 99 |
-
|
| 100 |
-
local_dir.mkdir(parents=True, exist_ok=True)
|
| 101 |
-
|
| 102 |
-
# Collect models to download
|
| 103 |
-
download_list = []
|
| 104 |
-
for model_name in detection_models:
|
| 105 |
-
if model_name in det_models_dict:
|
| 106 |
-
download_list.append(det_models_dict[model_name])
|
| 107 |
-
for model_name in recognition_models:
|
| 108 |
-
if model_name in rec_models_dict["gen2"]:
|
| 109 |
-
download_list.append(rec_models_dict["gen2"][model_name])
|
| 110 |
-
|
| 111 |
-
# Download models
|
| 112 |
-
for model_details in download_list:
|
| 113 |
-
buf = download_url_with_progress(model_details["url"], progress=progress)
|
| 114 |
-
with zipfile.ZipFile(buf, "r") as zip_ref:
|
| 115 |
-
zip_ref.extractall(local_dir)
|
| 116 |
-
|
| 117 |
-
return local_dir
|
| 118 |
-
|
| 119 |
-
def __call__(
|
| 120 |
-
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
| 121 |
-
) -> Iterable[Page]:
|
| 122 |
-
|
| 123 |
-
if not self.enabled:
|
| 124 |
-
yield from page_batch
|
| 125 |
-
return
|
| 126 |
-
|
| 127 |
-
for page in page_batch:
|
| 128 |
-
|
| 129 |
-
assert page._backend is not None
|
| 130 |
-
if not page._backend.is_valid():
|
| 131 |
-
yield page
|
| 132 |
-
else:
|
| 133 |
-
with TimeRecorder(conv_res, "ocr"):
|
| 134 |
-
ocr_rects = self.get_ocr_rects(page)
|
| 135 |
-
|
| 136 |
-
all_ocr_cells = []
|
| 137 |
-
for ocr_rect in ocr_rects:
|
| 138 |
-
# Skip zero area boxes
|
| 139 |
-
if ocr_rect.area() == 0:
|
| 140 |
-
continue
|
| 141 |
-
high_res_image = page._backend.get_page_image(
|
| 142 |
-
scale=self.scale, cropbox=ocr_rect
|
| 143 |
-
)
|
| 144 |
-
im = numpy.array(high_res_image)
|
| 145 |
-
result = self.reader.readtext(im)
|
| 146 |
-
|
| 147 |
-
del high_res_image
|
| 148 |
-
del im
|
| 149 |
-
|
| 150 |
-
cells = [
|
| 151 |
-
OcrCell(
|
| 152 |
-
id=ix,
|
| 153 |
-
text=line[1],
|
| 154 |
-
confidence=line[2],
|
| 155 |
-
bbox=BoundingBox.from_tuple(
|
| 156 |
-
coord=(
|
| 157 |
-
(line[0][0][0] / self.scale) + ocr_rect.l,
|
| 158 |
-
(line[0][0][1] / self.scale) + ocr_rect.t,
|
| 159 |
-
(line[0][2][0] / self.scale) + ocr_rect.l,
|
| 160 |
-
(line[0][2][1] / self.scale) + ocr_rect.t,
|
| 161 |
-
),
|
| 162 |
-
origin=CoordOrigin.TOPLEFT,
|
| 163 |
-
),
|
| 164 |
-
)
|
| 165 |
-
for ix, line in enumerate(result)
|
| 166 |
-
if line[2] >= self.options.confidence_threshold
|
| 167 |
-
]
|
| 168 |
-
all_ocr_cells.extend(cells)
|
| 169 |
-
|
| 170 |
-
# Post-process the cells
|
| 171 |
-
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
| 172 |
-
|
| 173 |
-
# DEBUG code:
|
| 174 |
-
if settings.debug.visualize_ocr:
|
| 175 |
-
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
| 176 |
-
|
| 177 |
-
yield page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py
DELETED
|
@@ -1,197 +0,0 @@
|
|
| 1 |
-
import copy
|
| 2 |
-
import logging
|
| 3 |
-
import warnings
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
from typing import Iterable, Optional, Union
|
| 6 |
-
|
| 7 |
-
from docling_core.types.doc import DocItemLabel
|
| 8 |
-
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
| 9 |
-
from PIL import Image
|
| 10 |
-
|
| 11 |
-
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
| 12 |
-
from docling.datamodel.document import ConversionResult
|
| 13 |
-
from docling.datamodel.pipeline_options import AcceleratorOptions
|
| 14 |
-
from docling.datamodel.settings import settings
|
| 15 |
-
from docling.models.base_model import BasePageModel
|
| 16 |
-
from docling.utils.accelerator_utils import decide_device
|
| 17 |
-
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
| 18 |
-
from docling.utils.profiling import TimeRecorder
|
| 19 |
-
from docling.utils.visualization import draw_clusters
|
| 20 |
-
|
| 21 |
-
_log = logging.getLogger(__name__)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
class LayoutModel(BasePageModel):
|
| 25 |
-
_model_repo_folder = "ds4sd--docling-models"
|
| 26 |
-
_model_path = "model_artifacts/layout"
|
| 27 |
-
|
| 28 |
-
TEXT_ELEM_LABELS = [
|
| 29 |
-
DocItemLabel.TEXT,
|
| 30 |
-
DocItemLabel.FOOTNOTE,
|
| 31 |
-
DocItemLabel.CAPTION,
|
| 32 |
-
DocItemLabel.CHECKBOX_UNSELECTED,
|
| 33 |
-
DocItemLabel.CHECKBOX_SELECTED,
|
| 34 |
-
DocItemLabel.SECTION_HEADER,
|
| 35 |
-
DocItemLabel.PAGE_HEADER,
|
| 36 |
-
DocItemLabel.PAGE_FOOTER,
|
| 37 |
-
DocItemLabel.CODE,
|
| 38 |
-
DocItemLabel.LIST_ITEM,
|
| 39 |
-
DocItemLabel.FORMULA,
|
| 40 |
-
]
|
| 41 |
-
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
| 42 |
-
|
| 43 |
-
TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
|
| 44 |
-
FIGURE_LABEL = DocItemLabel.PICTURE
|
| 45 |
-
FORMULA_LABEL = DocItemLabel.FORMULA
|
| 46 |
-
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
| 47 |
-
|
| 48 |
-
def __init__(
|
| 49 |
-
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
|
| 50 |
-
):
|
| 51 |
-
device = decide_device(accelerator_options.device)
|
| 52 |
-
|
| 53 |
-
if artifacts_path is None:
|
| 54 |
-
artifacts_path = self.download_models() / self._model_path
|
| 55 |
-
else:
|
| 56 |
-
# will become the default in the future
|
| 57 |
-
if (artifacts_path / self._model_repo_folder).exists():
|
| 58 |
-
artifacts_path = (
|
| 59 |
-
artifacts_path / self._model_repo_folder / self._model_path
|
| 60 |
-
)
|
| 61 |
-
elif (artifacts_path / self._model_path).exists():
|
| 62 |
-
warnings.warn(
|
| 63 |
-
"The usage of artifacts_path containing directly "
|
| 64 |
-
f"{self._model_path} is deprecated. Please point "
|
| 65 |
-
"the artifacts_path to the parent containing "
|
| 66 |
-
f"the {self._model_repo_folder} folder.",
|
| 67 |
-
DeprecationWarning,
|
| 68 |
-
stacklevel=3,
|
| 69 |
-
)
|
| 70 |
-
artifacts_path = artifacts_path / self._model_path
|
| 71 |
-
|
| 72 |
-
self.layout_predictor = LayoutPredictor(
|
| 73 |
-
artifact_path=str(artifacts_path),
|
| 74 |
-
device=device,
|
| 75 |
-
num_threads=accelerator_options.num_threads,
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
@staticmethod
|
| 79 |
-
def download_models(
|
| 80 |
-
local_dir: Optional[Path] = None,
|
| 81 |
-
force: bool = False,
|
| 82 |
-
progress: bool = False,
|
| 83 |
-
) -> Path:
|
| 84 |
-
from huggingface_hub import snapshot_download
|
| 85 |
-
from huggingface_hub.utils import disable_progress_bars
|
| 86 |
-
|
| 87 |
-
if not progress:
|
| 88 |
-
disable_progress_bars()
|
| 89 |
-
download_path = snapshot_download(
|
| 90 |
-
repo_id="ds4sd/docling-models",
|
| 91 |
-
force_download=force,
|
| 92 |
-
local_dir=local_dir,
|
| 93 |
-
revision="v2.1.0",
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
return Path(download_path)
|
| 97 |
-
|
| 98 |
-
def draw_clusters_and_cells_side_by_side(
|
| 99 |
-
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
| 100 |
-
):
|
| 101 |
-
"""
|
| 102 |
-
Draws a page image side by side with clusters filtered into two categories:
|
| 103 |
-
- Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE.
|
| 104 |
-
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
|
| 105 |
-
Includes label names and confidence scores for each cluster.
|
| 106 |
-
"""
|
| 107 |
-
scale_x = page.image.width / page.size.width
|
| 108 |
-
scale_y = page.image.height / page.size.height
|
| 109 |
-
|
| 110 |
-
# Filter clusters for left and right images
|
| 111 |
-
exclude_labels = {
|
| 112 |
-
DocItemLabel.FORM,
|
| 113 |
-
DocItemLabel.KEY_VALUE_REGION,
|
| 114 |
-
DocItemLabel.PICTURE,
|
| 115 |
-
}
|
| 116 |
-
left_clusters = [c for c in clusters if c.label not in exclude_labels]
|
| 117 |
-
right_clusters = [c for c in clusters if c.label in exclude_labels]
|
| 118 |
-
# Create a deep copy of the original image for both sides
|
| 119 |
-
left_image = copy.deepcopy(page.image)
|
| 120 |
-
right_image = copy.deepcopy(page.image)
|
| 121 |
-
|
| 122 |
-
# Draw clusters on both images
|
| 123 |
-
draw_clusters(left_image, left_clusters, scale_x, scale_y)
|
| 124 |
-
draw_clusters(right_image, right_clusters, scale_x, scale_y)
|
| 125 |
-
# Combine the images side by side
|
| 126 |
-
combined_width = left_image.width * 2
|
| 127 |
-
combined_height = left_image.height
|
| 128 |
-
combined_image = Image.new("RGB", (combined_width, combined_height))
|
| 129 |
-
combined_image.paste(left_image, (0, 0))
|
| 130 |
-
combined_image.paste(right_image, (left_image.width, 0))
|
| 131 |
-
if show:
|
| 132 |
-
combined_image.show()
|
| 133 |
-
else:
|
| 134 |
-
out_path: Path = (
|
| 135 |
-
Path(settings.debug.debug_output_path)
|
| 136 |
-
/ f"debug_{conv_res.input.file.stem}"
|
| 137 |
-
)
|
| 138 |
-
out_path.mkdir(parents=True, exist_ok=True)
|
| 139 |
-
out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
|
| 140 |
-
combined_image.save(str(out_file), format="png")
|
| 141 |
-
|
| 142 |
-
def __call__(
|
| 143 |
-
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
| 144 |
-
) -> Iterable[Page]:
|
| 145 |
-
|
| 146 |
-
for page in page_batch:
|
| 147 |
-
assert page._backend is not None
|
| 148 |
-
if not page._backend.is_valid():
|
| 149 |
-
yield page
|
| 150 |
-
else:
|
| 151 |
-
with TimeRecorder(conv_res, "layout"):
|
| 152 |
-
assert page.size is not None
|
| 153 |
-
page_image = page.get_image(scale=1.0)
|
| 154 |
-
assert page_image is not None
|
| 155 |
-
|
| 156 |
-
clusters = []
|
| 157 |
-
for ix, pred_item in enumerate(
|
| 158 |
-
self.layout_predictor.predict(page_image)
|
| 159 |
-
):
|
| 160 |
-
label = DocItemLabel(
|
| 161 |
-
pred_item["label"]
|
| 162 |
-
.lower()
|
| 163 |
-
.replace(" ", "_")
|
| 164 |
-
.replace("-", "_")
|
| 165 |
-
) # Temporary, until docling-ibm-model uses docling-core types
|
| 166 |
-
cluster = Cluster(
|
| 167 |
-
id=ix,
|
| 168 |
-
label=label,
|
| 169 |
-
confidence=pred_item["confidence"],
|
| 170 |
-
bbox=BoundingBox.model_validate(pred_item),
|
| 171 |
-
cells=[],
|
| 172 |
-
)
|
| 173 |
-
clusters.append(cluster)
|
| 174 |
-
|
| 175 |
-
if settings.debug.visualize_raw_layout:
|
| 176 |
-
self.draw_clusters_and_cells_side_by_side(
|
| 177 |
-
conv_res, page, clusters, mode_prefix="raw"
|
| 178 |
-
)
|
| 179 |
-
|
| 180 |
-
# Apply postprocessing
|
| 181 |
-
|
| 182 |
-
processed_clusters, processed_cells = LayoutPostprocessor(
|
| 183 |
-
page.cells, clusters, page.size
|
| 184 |
-
).postprocess()
|
| 185 |
-
# processed_clusters, processed_cells = clusters, page.cells
|
| 186 |
-
|
| 187 |
-
page.cells = processed_cells
|
| 188 |
-
page.predictions.layout = LayoutPrediction(
|
| 189 |
-
clusters=processed_clusters
|
| 190 |
-
)
|
| 191 |
-
|
| 192 |
-
if settings.debug.visualize_layout:
|
| 193 |
-
self.draw_clusters_and_cells_side_by_side(
|
| 194 |
-
conv_res, page, processed_clusters, mode_prefix="postprocessed"
|
| 195 |
-
)
|
| 196 |
-
|
| 197 |
-
yield page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|