ZaynZhu commited on
Commit
b447602
·
1 Parent(s): 44efbff
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Paper2Video/LICENSE +0 -21
  2. Paper2Video/README-CN.md +0 -248
  3. Paper2Video/README.md +0 -251
  4. Paper2Video/__init__.py +0 -0
  5. Paper2Video/src/__init__.py +0 -0
  6. Paper2Video/src/evaluation/IPMemory/construct.py +0 -69
  7. Paper2Video/src/evaluation/IPMemory/ip_qa.py +0 -142
  8. Paper2Video/src/evaluation/MetaSim_audio.py +0 -102
  9. Paper2Video/src/evaluation/MetaSim_content.py +0 -144
  10. Paper2Video/src/evaluation/PresentArena.py +0 -106
  11. Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py +0 -264
  12. Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py +0 -47
  13. Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py +0 -0
  14. Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py +0 -0
  15. Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py +0 -63
  16. Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py +0 -430
  17. Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py +0 -227
  18. Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py +0 -250
  19. Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py +0 -442
  20. Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py +0 -0
  21. Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py +0 -58
  22. Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py +0 -428
  23. Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py +0 -386
  24. Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py +0 -424
  25. Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py +0 -582
  26. Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py +0 -76
  27. Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py +0 -260
  28. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py +0 -0
  29. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py +0 -592
  30. Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py +0 -1888
  31. Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py +0 -12
  32. Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py +0 -0
  33. Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py +0 -456
  34. Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py +0 -107
  35. Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py +0 -17
  36. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py +0 -0
  37. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py +0 -258
  38. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py +0 -394
  39. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py +0 -296
  40. Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py +0 -67
  41. Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py +0 -348
  42. Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py +0 -6
  43. Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py +0 -0
  44. Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py +0 -87
  45. Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py +0 -189
  46. Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py +0 -251
  47. Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py +0 -190
  48. Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py +0 -386
  49. Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py +0 -177
  50. Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py +0 -197
Paper2Video/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Show Lab
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/README-CN.md DELETED
@@ -1,248 +0,0 @@
1
- # Paper2Video
2
-
3
- <p align="right">
4
- <a href="./README.md">English</a> | <b>简体中文</b>
5
- </p>
6
-
7
-
8
- <p align="center">
9
- <b>Paper2Video: 从学术论文自动生成演讲视频</b>
10
- <br>
11
-
12
-
13
- <p align="center">
14
- <a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
15
- <a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
16
- <a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
17
- 新加坡国立大学 Show Lab
18
- </p>
19
-
20
-
21
- <p align="center">
22
-   <a href="https://arxiv.org/abs/2510.05096">📄 论文</a> &nbsp; | &nbsp;
23
- <a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> &nbsp; | &nbsp;
24
-   <a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 数据集</a> &nbsp; | &nbsp;
25
-   <a href="https://showlab.github.io/Paper2Video/">🌐 项目主页</a> &nbsp; | &nbsp;
26
-   <a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 推特</a>
27
- </p>
28
-
29
- - **输入:** 一篇论文 ➕ 一张图像 ➕ 一段音频
30
-
31
- | 论文 | 图像 | 音频 |
32
- |--------|--------|--------|
33
- | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 论文链接](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton的图像| <img src="assets/sound.png" width="180"/><br>[🔗 音频样本](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
34
-
35
-
36
- - **输出:** 演讲视频
37
-
38
-
39
-
40
- https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
41
-
42
-
43
-
44
-
45
- 查看更多生成结果 [🌐 project page](https://showlab.github.io/Paper2Video/).
46
-
47
- ## 🔥 Update
48
- - [x] [2025.10.11] 我们的工作在[YC Hacker News](https://news.ycombinator.com/item?id=45553701)上受到关注.
49
- - [x] [2025.10.9] 感谢AK在[Twitter](https://x.com/_akhaliq/status/1976099830004072849)上分享我们的工作!
50
- - [x] [2025.10.9] 我们的工作被 [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27)报道.
51
- - [x] [2025.10.8] 下方查看我们的demo视频!
52
- - [x] [2025.10.7] 我们发布了 [Arxiv 论文](https://arxiv.org/abs/2510.05096).
53
- - [x] [2025.10.6] 我们发布了 [代码](https://github.com/showlab/Paper2Video) and [数据集](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
54
- - [x] [2025.9.28] Paper2Video 已经被 **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025** 接受.
55
-
56
-
57
- https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
58
-
59
-
60
-
61
-
62
- ---
63
-
64
- ### Table of Contents
65
- - [🌟 项目总览](#-项目总览)
66
- - [🚀 快速上手: PaperTalker](#-快速上手-PaperTalker)
67
- - [1. 环境配置](#1-环境配置)
68
- - [2. 大语言模型配置](#2-大语言模型配置)
69
- - [3. 推理](#3-推理)
70
- - [📊 评价指标: Paper2Video](#-评价指标-Paper2Video)
71
- - [😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频](#-乐趣-Paper2Video生成Paper2Video演讲视频)
72
- - [🙏 致谢](#-致谢)
73
- - [📌 引用](#-引用)
74
- ---
75
-
76
- ## 🌟 项目总览
77
- <p align="center">
78
- <img src="assets/teaser.png" alt="Overview" width="100%">
79
- </p>
80
-
81
- 这项工作解决了学术演讲的两个核心问题:
82
-
83
- - **左边: 如何根据论文制作学术演讲?**
84
- *PaperTalker* — 集成**幻灯片**、**字幕**、**光标**、**语音合成**和**演讲者视频渲染**的多智能体。
85
-
86
- - **右边: 如何评估学术演讲视频?**
87
- *Paper2Video* — 一个具有精心设计的指标来评估演示质量的基准。
88
-
89
-
90
- ---
91
-
92
- ## 🚀 尝试 PaperTalker 为你的论文制作演讲视频 !
93
- <p align="center">
94
- <img src="assets/method.png" alt="Approach" width="100%">
95
- </p>
96
-
97
- ### 1. 环境配置
98
- 准备Python环境:
99
- ```bash
100
- cd src
101
- conda create -n p2v python=3.10
102
- conda activate p2v
103
- pip install -r requirements.txt
104
- conda install -c conda-forge tectonic
105
- ````
106
- 下载所依赖代码,并按照[Hallo2](https://github.com/fudan-generative-vision/hallo2)中的说明下载模型权重。
107
- ```bash
108
- git clone https://github.com/fudan-generative-vision/hallo2.git
109
- ```
110
- 您需要**单独准备用于 talking-head generation 的环境**,以避免潜在的软件包冲突,请参考<a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>。安装完成后,使用 `which python` 命令获取 Python 环境路径。
111
- ```bash
112
- cd hallo2
113
- conda create -n hallo python=3.10
114
- conda activate hallo
115
- pip install -r requirements.txt
116
- ```
117
-
118
- ### 2. 大语言模型配置
119
- 在终端配置您的**API 凭证**:
120
- ```bash
121
- export GEMINI_API_KEY="your_gemini_key_here"
122
- export OPENAI_API_KEY="your_openai_key_here"
123
- ```
124
- 最佳实践是针对 LLM 和 VLM 使用 **GPT4.1** 或 **Gemini2.5-Pro**。我们也支持本地部署开源模型(例如 Qwen),详情请参阅 <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>。
125
-
126
- ### 3. 推理
127
- 脚本 `pipeline.py` 提供了一个自动化的学术演示视频生成流程。它以 **LaTeX 论文素材** 和 **参考图像/音频** 作为输入,并经过多个子模块(幻灯片 → 字幕 → 语音 → 光标 → 头部特写)生成完整的演示视频。⚡ 运行此流程的最低推荐 GPU 为 **NVIDIA A6000**,显存 48G。
128
-
129
- #### 示例用法
130
-
131
- 运行以下命令来启动完整生成:
132
-
133
- ```bash
134
- python pipeline.py \
135
- --model_name_t gpt-4.1 \
136
- --model_name_v gpt-4.1 \
137
- --model_name_talking hallo2 \
138
- --result_dir /path/to/output \
139
- --paper_latex_root /path/to/latex_proj \
140
- --ref_img /path/to/ref_img.png \
141
- --ref_audio /path/to/ref_audio.wav \
142
- --talking_head_env /path/to/hallo2_env \
143
- --gpu_list [0,1,2,3,4,5,6,7]
144
- ```
145
-
146
- | 参数名 | 类型 | 默认值 | 说明 |
147
- |----------|------|---------|-------------|
148
- | `--model_name_t` | `str` | `gpt-4.1` | 文本大语言模型(LLM) |
149
- | `--model_name_v` | `str` | `gpt-4.1` | 视觉语言模型(VLM) |
150
- | `--model_name_talking` | `str` | `hallo2` | Talking Head 模型。目前仅支持 **hallo2** |
151
- | `--result_dir` | `str` | `/path/to/output` | 输出目录(包括幻灯片、字幕、视频等) |
152
- | `--paper_latex_root` | `str` | `/path/to/latex_proj` | 论文 LaTeX 项目的根目录 |
153
- | `--ref_img` | `str` | `/path/to/ref_img.png` | 参考图像(必须为**正方形**人像) |
154
- | `--ref_audio` | `str` | `/path/to/ref_audio.wav` | 参考音频(建议时长约为 10 秒) |
155
- | `--ref_text` | `str` | `None` | 可选参考文本(用于字幕风格指导) |
156
- | `--beamer_templete_prompt` | `str` | `None` | 可选参考文本(用于幻灯片风格指导) |
157
- | `--gpu_list` | `list[int]` | `""` | GPU 列表,用于并行执行(适用于**光标生成**与 **Talking Head 渲染**) |
158
- | `--if_tree_search` | `bool` | `True` | 是否启用树搜索(用于幻灯片布局优化) |
159
- | `--stage` | `str` | `"[0]"` | 需要运行的阶段(例如 `[0]` 表示完整流程,`[1,2,3]` 表示部分阶段) |
160
- | `--talking_head_env` | `str` | `/path/to/hallo2_env` | Talking Head 生成的 Python 环境路径 |
161
- ---
162
-
163
- ## 📊 评价指标: Paper2Video
164
- <p align="center">
165
- <img src="assets/metrics.png" alt="Metrics" width="100%">
166
- </p>
167
-
168
- 与自然视频生成不同,学术演示视频发挥着高度专业化的作用:它们不仅关乎视觉保真度,更关乎**学术交流**。这使得直接应用视频合成中的传统指标(例如 FVD、IS 或基于 CLIP 的相似度)变得困难。相反,它们的价值在于它们如何有效地**传播研究成果**并**提升学术知名度**。从这个角度来看,我们认为,评判高质量的学术演示视频应该从两个互补的维度进行评判:
169
- #### 对于观众
170
- - 视频应**忠实传达论文的核心思想**。
171
- - 视频应**易于不同受众观看**。
172
-
173
- #### 对于作者
174
- - 视频应**突出作者的智力贡献和身份**。
175
- - 视频应**提升作品的知名度和影响力**。
176
-
177
- 为了实现这些目标,我们引入了专门为学术演示视频设计的评估指标:Meta Similarity, PresentArena, PresentQuiz, IP Memory.
178
-
179
- ### 运行评价
180
- - 准备环境:
181
- ```bash
182
- cd src/evaluation
183
- conda create -n p2v_e python=3.10
184
- conda activate p2v_e
185
- pip install -r requirements.txt
186
- ```
187
- - 对于 Meta Similarity 和 PresentArena:
188
- ```bash
189
- python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
190
- python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
191
- ```
192
- ```bash
193
- python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
194
- ```
195
- - 对于**PresentQuiz**,首先基于论文生成问题并使用 Gemini 进行评估:
196
- ```bash
197
- cd PresentQuiz
198
- python create_paper_questions.py --paper_folder /path/to/data
199
- python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
200
- ```
201
-
202
- - 对于**IP Memory**,首先从生成的视频中生成问题对,然后使用 Gemini 进行评估:
203
- ```bash
204
- cd IPMemory
205
- python construct.py
206
- python ip_qa.py
207
- ```
208
- 更多详情请查看代码!
209
-
210
- 👉 Paper2Video 数据集可在以下网址获取:
211
- [HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
212
-
213
- ---
214
-
215
- ## 😼 乐趣: Paper2Video 生成 Paper2Video 演讲视频
216
- 查看 **Paper2Video 生成 Paper2Video 演讲视频**:
217
-
218
- https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
219
-
220
- ## 🙏 致谢
221
-
222
- * 数据集中演示视频的来源是 SlideLive 和 YouTube。
223
- * 感谢所有为制作演示视频付出辛勤努力的作者!
224
- * 感谢 [CAMEL](https://github.com/camel-ai/camel) 开源了组织良好的多智能体框架代码库。
225
- * 感谢 [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) 和 [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) 作者开源代码。
226
- * 感谢 [Wei Jia](https://github.com/weeadd) 在数据收集和baselines实现方面所做的努力。我们也感谢所有参与用户调研的参与者。
227
- * 感谢所有 **Show Lab @ NUS** 成员的支持!
228
-
229
-
230
-
231
- ---
232
-
233
- ## 📌 引用
234
-
235
-
236
- 如果我们的工作对您有帮助,欢迎引用我们的工作:
237
-
238
- ```bibtex
239
- @misc{paper2video,
240
- title={Paper2Video: Automatic Video Generation from Scientific Papers},
241
- author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
242
- year={2025},
243
- eprint={2510.05096},
244
- archivePrefix={arXiv},
245
- primaryClass={cs.CV},
246
- url={https://arxiv.org/abs/2510.05096},
247
- }
248
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/README.md DELETED
@@ -1,251 +0,0 @@
1
- # Paper2Video
2
-
3
- <p align="right">
4
- <b>English</b> | <a href="./README-CN.md">简体中文</a>
5
- </p>
6
-
7
-
8
- <p align="center">
9
- <b>Paper2Video: Automatic Video Generation from Scientific Papers</b>
10
- <br>
11
- 从学术论文自动生成演讲视频
12
- </p>
13
-
14
- <p align="center">
15
- <a href="https://zeyu-zhu.github.io/webpage/">Zeyu Zhu*</a>,
16
- <a href="https://qhlin.me/">Kevin Qinghong Lin*</a>,
17
- <a href="https://scholar.google.com/citations?user=h1-3lSoAAAAJ&hl=en">Mike Zheng Shou</a> <br>
18
- Show Lab, National University of Singapore
19
- </p>
20
-
21
-
22
- <p align="center">
23
-   <a href="https://arxiv.org/abs/2510.05096">📄 Paper</a> &nbsp; | &nbsp;
24
- <a href="https://huggingface.co/papers/2510.05096">🤗 Daily Paper</a> &nbsp; | &nbsp;
25
-   <a href="https://huggingface.co/datasets/ZaynZhu/Paper2Video">📊 Dataset</a> &nbsp; | &nbsp;
26
-   <a href="https://showlab.github.io/Paper2Video/">🌐 Project Website</a> &nbsp; | &nbsp;
27
-   <a href="https://x.com/KevinQHLin/status/1976105129146257542">💬 X (Twitter)</a>
28
- </p>
29
-
30
- - **Input:** a paper ➕ an image ➕ an audio
31
-
32
- | Paper | Image | Audio |
33
- |--------|--------|--------|
34
- | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/paper.png" width="180"/><br>[🔗 Paper link](https://arxiv.org/pdf/1509.01626) | <img src="https://github.com/showlab/Paper2Video/blob/page/assets/hinton/hinton_head.jpeg" width="180"/> <br>Hinton's photo| <img src="assets/sound.png" width="180"/><br>[🔗 Audio sample](https://github.com/showlab/Paper2Video/blob/page/assets/hinton/ref_audio_10.wav) |
35
-
36
-
37
- - **Output:** a presentation video
38
-
39
-
40
-
41
- https://github.com/user-attachments/assets/39221a9a-48cb-4e20-9d1c-080a5d8379c4
42
-
43
-
44
-
45
-
46
- Check out more examples at [🌐 project page](https://showlab.github.io/Paper2Video/).
47
-
48
- ## 🔥 Update
49
- - [x] [2025.10.11] Our work receives attention on [YC Hacker News](https://news.ycombinator.com/item?id=45553701).
50
- - [x] [2025.10.9] Thanks AK for sharing our work on [Twitter](https://x.com/_akhaliq/status/1976099830004072849)!
51
- - [x] [2025.10.9] Our work is reported by [Medium](https://medium.com/@dataism/how-ai-learned-to-make-scientific-videos-from-slides-to-a-talking-head-0d807e491b27).
52
- - [x] [2025.10.8] Check out our demo video below!
53
- - [x] [2025.10.7] We release the [arxiv paper](https://arxiv.org/abs/2510.05096).
54
- - [x] [2025.10.6] We release the [code](https://github.com/showlab/Paper2Video) and [dataset](https://huggingface.co/datasets/ZaynZhu/Paper2Video).
55
- - [x] [2025.9.28] Paper2Video has been accepted to the **Scaling Environments for Agents Workshop([SEA](https://sea-workshop.github.io/)) at NeurIPS 2025**.
56
-
57
-
58
- https://github.com/user-attachments/assets/a655e3c7-9d76-4c48-b946-1068fdb6cdd9
59
-
60
-
61
-
62
-
63
- ---
64
-
65
- ### Table of Contents
66
- - [🌟 Overview](#-overview)
67
- - [🚀 Quick Start: PaperTalker](#-try-papertalker-for-your-paper-)
68
- - [1. Requirements](#1-requirements)
69
- - [2. Configure LLMs](#2-configure-llms)
70
- - [3. Inference](#3-inference)
71
- - [📊 Evaluation: Paper2Video](#-evaluation-paper2video)
72
- - [😼 Fun: Paper2Video for Paper2Video](#-fun-paper2video-for-paper2video)
73
- - [🙏 Acknowledgements](#-acknowledgements)
74
- - [📌 Citation](#-citation)
75
-
76
- ---
77
-
78
- ## 🌟 Overview
79
- <p align="center">
80
- <img src="assets/teaser.png" alt="Overview" width="100%">
81
- </p>
82
-
83
- This work solves two core problems for academic presentations:
84
-
85
- - **Left: How to create a presentation video from a paper?**
86
- *PaperTalker* — an agent that integrates **slides**, **subtitling**, **cursor grounding**, **speech synthesis**, and **talking-head video rendering**.
87
-
88
- - **Right: How to evaluate a presentation video?**
89
- *Paper2Video* — a benchmark with well-designed metrics to evaluate presentation quality.
90
-
91
-
92
- ---
93
-
94
- ## 🚀 Try PaperTalker for your Paper!
95
- <p align="center">
96
- <img src="assets/method.png" alt="Approach" width="100%">
97
- </p>
98
-
99
- ### 1. Requirements
100
- Prepare the environment:
101
- ```bash
102
- cd src
103
- conda create -n p2v python=3.10
104
- conda activate p2v
105
- pip install -r requirements.txt
106
- conda install -c conda-forge tectonic
107
- ````
108
- Download the dependent code and follow the instructions in **[Hallo2](https://github.com/fudan-generative-vision/hallo2)** to download the model weight.
109
- ```bash
110
- git clone https://github.com/fudan-generative-vision/hallo2.git
111
- ```
112
- You need to **prepare the environment separately for talking-head generation** to avoid potential package conflicts; please refer to <a href="https://github.com/fudan-generative-vision/hallo2">Hallo2</a>. After installing, use `which python` to get the python environment path.
113
- ```bash
114
- cd hallo2
115
- conda create -n hallo python=3.10
116
- conda activate hallo
117
- pip install -r requirements.txt
118
- ```
119
-
120
- ### 2. Configure LLMs
121
- Export your **API credentials**:
122
- ```bash
123
- export GEMINI_API_KEY="your_gemini_key_here"
124
- export OPENAI_API_KEY="your_openai_key_here"
125
- ```
126
- The best practice is to use **GPT4.1** or **Gemini2.5-Pro** for both LLM and VLMs. We also support locally deployed open-source model(e.g., Qwen), details please referring to <a href="https://github.com/Paper2Poster/Paper2Poster.git">Paper2Poster</a>.
127
-
128
- ### 3. Inference
129
- The script `pipeline.py` provides an automated pipeline for generating academic presentation videos. It takes **LaTeX paper sources** together with **reference image/audio** as input, and goes through multiple sub-modules (Slides → Subtitles → Speech → Cursor → Talking Head) to produce a complete presentation video. ⚡ The minimum recommended GPU for running this pipeline is **NVIDIA A6000** with 48G.
130
-
131
- #### Example Usage
132
-
133
- Run the following command to launch a full generation:
134
-
135
- ```bash
136
- python pipeline.py \
137
- --model_name_t gpt-4.1 \
138
- --model_name_v gpt-4.1 \
139
- --model_name_talking hallo2 \
140
- --result_dir /path/to/output \
141
- --paper_latex_root /path/to/latex_proj \
142
- --ref_img /path/to/ref_img.png \
143
- --ref_audio /path/to/ref_audio.wav \
144
- --talking_head_env /path/to/hallo2_env \
145
- --gpu_list [0,1,2,3,4,5,6,7]
146
- ```
147
-
148
- | Argument | Type | Default | Description |
149
- |----------|------|---------|-------------|
150
- | `--model_name_t` | `str` | `gpt-4.1` | LLM |
151
- | `--model_name_v` | `str` | `gpt-4.1` | VLM |
152
- | `--model_name_talking` | `str` | `hallo2` | Talking Head model. Currently only **hallo2** is supported |
153
- | `--result_dir` | `str` | `/path/to/output` | Output directory (slides, subtitles, videos, etc.) |
154
- | `--paper_latex_root` | `str` | `/path/to/latex_proj` | Root directory of the LaTeX paper project |
155
- | `--ref_img` | `str` | `/path/to/ref_img.png` | Reference image (must be **square** portrait) |
156
- | `--ref_audio` | `str` | `/path/to/ref_audio.wav` | Reference audio (recommended: ~10s) |
157
- | `--ref_text` | `str` | `None` | Optional reference text (for style guidance for subtitles) |
158
- | `--beamer_templete_prompt` | `str` | `None` | Optional reference text (for style guidance for slides) |
159
- | `--gpu_list` | `list[int]` | `""` | GPU list for parallel execution (used in **cursor generation** and **Talking Head rendering**) |
160
- | `--if_tree_search` | `bool` | `True` | Whether to enable tree search for slide layout refinement |
161
- | `--stage` | `str` | `"[0]"` | Pipeline stages to run (e.g., `[0]` full pipeline, `[1,2,3]` partial stages) |
162
- | `--talking_head_env` | `str` | `/path/to/hallo2_env` | python environment path for talking-head generation |
163
- ---
164
-
165
- ## 📊 Evaluation: Paper2Video
166
- <p align="center">
167
- <img src="assets/metrics.png" alt="Metrics" width="100%">
168
- </p>
169
-
170
- Unlike natural video generation, academic presentation videos serve a highly specialized role: they are not merely about visual fidelity but about **communicating scholarship**. This makes it difficult to directly apply conventional metrics from video synthesis(e.g., FVD, IS, or CLIP-based similarity). Instead, their value lies in how well they **disseminate research** and **amplify scholarly visibility**.From this perspective, we argue that a high-quality academic presentation video should be judged along two complementary dimensions:
171
- #### For the Audience
172
- - The video is expected to **faithfully convey the paper’s core ideas**.
173
- - It should remain **accessible to diverse audiences**.
174
-
175
- #### For the Author
176
- - The video should **foreground the authors’ intellectual contribution and identity**.
177
- - It should **enhance the work’s visibility and impact**.
178
-
179
- To capture these goals, we introduce evaluation metrics specifically designed for academic presentation videos: Meta Similarity, PresentArena, PresentQuiz, IP Memory.
180
-
181
- ### Run Eval
182
- - Prepare the environment:
183
- ```bash
184
- cd src/evaluation
185
- conda create -n p2v_e python=3.10
186
- conda activate p2v_e
187
- pip install -r requirements.txt
188
- ```
189
- - For MetaSimilarity and PresentArena:
190
- ```bash
191
- python MetaSim_audio.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
192
- python MetaSim_content.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
193
- ```
194
- ```bash
195
- python PresentArena.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
196
- ```
197
- - For **PresentQuiz**, first generate questions from paper and eval using Gemini:
198
- ```bash
199
- cd PresentQuiz
200
- python create_paper_questions.py ----paper_folder /path/to/data
201
- python PresentQuiz.py --r /path/to/result_dir --g /path/to/gt_dir --s /path/to/save_dir
202
- ```
203
-
204
- - For **IP Memory**, first generate question pairs from generated videos and eval using Gemini:
205
- ```bash
206
- cd IPMemory
207
- python construct.py
208
- python ip_qa.py
209
- ```
210
- See the codes for more details!
211
-
212
- 👉 Paper2Video Benchmark is available at:
213
- [HuggingFace](https://huggingface.co/datasets/ZaynZhu/Paper2Video)
214
-
215
- ---
216
-
217
- ## 😼 Fun: Paper2Video for Paper2Video
218
- Check out **How Paper2Video for Paper2Video**:
219
-
220
- https://github.com/user-attachments/assets/ff58f4d8-8376-4e12-b967-711118adf3c4
221
-
222
- ## 🙏 Acknowledgements
223
-
224
- * The sources of the presentation videos are SlideLive and YouTube.
225
- * We thank all the authors who spend a great effort to create presentation videos!
226
- * We thank [CAMEL](https://github.com/camel-ai/camel) for open-source well-organized multi-agent framework codebase.
227
- * We thank the authors of [Hallo2](https://github.com/fudan-generative-vision/hallo2.git) and [Paper2Poster](https://github.com/Paper2Poster/Paper2Poster.git) for their open-sourced codes.
228
- * We thank [Wei Jia](https://github.com/weeadd) for his effort in collecting the data and implementing the baselines. We also thank all the participants involved in the human studies.
229
- * We thank all the **Show Lab @ NUS** members for support!
230
-
231
-
232
-
233
- ---
234
-
235
- ## 📌 Citation
236
-
237
-
238
- If you find our work useful, please cite:
239
-
240
- ```bibtex
241
- @misc{paper2video,
242
- title={Paper2Video: Automatic Video Generation from Scientific Papers},
243
- author={Zeyu Zhu and Kevin Qinghong Lin and Mike Zheng Shou},
244
- year={2025},
245
- eprint={2510.05096},
246
- archivePrefix={arXiv},
247
- primaryClass={cs.CV},
248
- url={https://arxiv.org/abs/2510.05096},
249
- }
250
- ```
251
- [![Star History](https://api.star-history.com/svg?repos=showlab/Paper2Video&type=Date)](https://star-history.com/#showlab/Paper2Video&Date)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/__init__.py DELETED
File without changes
Paper2Video/src/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/IPMemory/construct.py DELETED
@@ -1,69 +0,0 @@
1
- """
2
- construct question about Academic IP
3
- input query: 4 video clips from 4 different paper presentation + query (image/audio)
4
- input question: 4 understanding qa from corresponding paper
5
- output task: choose the right question to ask
6
- """
7
- import os, re
8
- import json
9
- import random
10
- import itertools
11
- from os import path
12
- from typing import List
13
- from pathlib import Path
14
- from tqdm import tqdm
15
-
16
- def generate_combinations(total_num, comb_size):
17
- return list(itertools.combinations(range(total_num), comb_size))
18
-
19
- def generate_ip_task(vaild_data_name, num_qa_pair):
20
- combs = list(itertools.combinations(range(len(vaild_data_name)), 4))
21
- combs = random.sample(combs, num_qa_pair)
22
-
23
- qa_list = []
24
- for comb in combs:
25
- ## questions
26
- question_list = []
27
- question_index = random.randint(1, 50)
28
- for index in comb:
29
- question_path = path.join(vaild_data_name[index][1], "4o-mini_qa.json")
30
- with open(question_path, 'r') as f: question = json.load(f)["understanding"]["questions"]
31
- question_list.append(question["Question {}".format(str(question_index))]["question"])
32
- ## query
33
- query_list = []
34
- for index in comb:
35
- ref_img_path = path.join(vaild_data_name[index][1], "ref_img.png")
36
- ref_audio_path = path.join(vaild_data_name[index][1], "ref_audio.wav")
37
- query_list.append((ref_img_path, ref_audio_path))
38
- ## qa
39
- qa = {}
40
- qa["videos"] = []
41
- for idx in range(len(comb)):
42
- qa["videos"].append(vaild_data_name[comb[idx]][0])
43
-
44
- qa["querys"] = query_list
45
- qa["questions"] = question_list
46
- qa_list.append(qa)
47
- with open("ip_qa.json", 'w') as f: json.dump(qa_list, f, indent=4)
48
-
49
- _num_at_start = re.compile(r'^\s*["\']?(\d+)')
50
- def sort_by_leading_number(paths: List[str]) -> List[str]:
51
- def key(p: str):
52
- name = Path(p).name
53
- m = _num_at_start.match(name)
54
- return (int(m.group(1)) if m else float('inf'), name)
55
- return sorted(paths, key=key)
56
-
57
- if __name__ == "__main__":
58
- num_qa_pair = 10 # C (num_data) (4)
59
- root_dir = "/path/to/result"
60
- gt_dir = "/path/to/data"
61
-
62
- all_data_name = sort_by_leading_number(os.listdir(root_dir))
63
- all_groundtruth = sort_by_leading_number(os.listdir(gt_dir))
64
- vaild_data_name = []
65
- for data_idx in range(len(all_data_name)):
66
- if path.basename(root_dir) == "paper2video":
67
- video_result_1 = path.join(root_dir, all_data_name[data_idx], "3_merage.mp4")
68
- video_result_2 = path.join(root_dir.replace("paper2video", "presentagent"), all_data_name[data_idx], "result.mp4")
69
- generate_ip_task(vaild_data_name, num_qa_pair)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/IPMemory/ip_qa.py DELETED
@@ -1,142 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import time
5
- import random
6
- import argparse, pdb
7
- from os import path
8
- import google.generativeai as genai
9
- from moviepy.editor import VideoFileClip
10
- from camel.models import ModelFactory
11
- from camel.types import ModelType, ModelPlatformType
12
- from camel.configs import GeminiConfig
13
- from typing import List
14
- from pathlib import Path
15
-
16
-
17
- genai.configure(api_key="")
18
-
19
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Order paths by the leading integer in their basename; non-numeric last."""
    def numeric_key(entry: str):
        base = Path(entry).name
        hit = _num_at_start.match(base)
        rank = int(hit.group(1)) if hit else float('inf')
        return (rank, base)

    return sorted(paths, key=numeric_key)
26
# Root of the human-made benchmark data; one sub-folder per paper.
dataset_path = "/path/to/data"
# Folder names sorted by their numeric prefix so vid_id indexing is stable.
dataset_list = sort_by_leading_number(os.listdir(dataset_path))
29
-
30
def eval_ip(root_path, clip_duration, model_list, prompt_path, question_path, test_type='image'):
    """Run the identity-preservation QA benchmark.

    For each QA entry: sample one short clip per (video id, model), upload the
    clips plus a query (reference face image or voice sample) to Gemini, and
    check whether the judge matches the query to the correct video.

    Parameters
    ----------
    root_path : str       root folder containing each system's generated videos
    clip_duration : float seconds of video sampled from each presentation
    model_list : list     system identifiers ('p2v', 'veo3', ...)
    prompt_path : str     judge prompt template file
    question_path : str   QA JSON produced by IPMemory/construct.py
    test_type : str       'image' (face query) or 'audio' (voice query)

    Fixes vs. previous revision:
    * prompt read verbatim — '"/n".join(readlines())' injected literal "/n"
    * the audio branch was spelled 'aduio' and never uploaded its query,
      leaving query_state unbound (NameError); both spellings now work
    * clip start sampling no longer goes negative for short videos
    * an unparseable judge reply records an incorrect answer instead of
      silently reusing the previous iteration's choice_num
    """
    tmp_dir = "tmp"
    os.makedirs(tmp_dir, exist_ok=True)
    gemini_model = genai.GenerativeModel("models/gemini-2.5-pro-flash")

    with open(prompt_path, 'r') as f:
        prompt = f.read()
    with open(question_path, 'r') as f:
        questions = json.load(f)

    result_each_question = []
    for question in questions:
        video_ids = question["videos"]
        querys = question["querys"]
        qs = question["questions"]

        ## sample one short clip per (video id, model)
        video_clips_path = {model: [] for model in model_list}

        start_p2v = None  # shared start so p2v / p2v-o clips are time-aligned
        for vid_id in video_ids:
            tmp_dir_id = path.join(tmp_dir, str(vid_id))
            os.makedirs(tmp_dir_id, exist_ok=True)
            for model in model_list:
                # each system lays out its outputs differently
                if model == 'p2v': video_path = path.join(root_path, "paper2video", str(vid_id), '3_merage.mp4')
                elif model == 'p2v-o': video_path = path.join(root_path, "paper2video_wo_presenter", str(vid_id), 'result.mp4')
                elif model == 'veo3': video_path = path.join(root_path, "veo3", str(vid_id)+".mp4")
                elif model == 'wan2.2': video_path = path.join(root_path, "wan2.2", str(int(vid_id)-1), "result.mp4")
                elif model == 'presentagent': video_path = path.join(root_path, "presentagent", str(vid_id), "result.mp4")
                elif model == 'human-made': video_path = path.join(dataset_path, dataset_list[int(vid_id)-1], "gt_presentation_video.mp4")

                video = VideoFileClip(video_path)
                # clamp so uniform() never gets a negative upper bound
                max_start = max(0, video.duration - clip_duration - 1)
                if model in ('p2v', 'p2v-o'):
                    if start_p2v is None:
                        start_p2v = random.uniform(0, max_start)
                    start = start_p2v
                else:
                    start = random.uniform(0, max_start)
                end = min(start + clip_duration, video.duration)

                clip_save_path = path.join(tmp_dir_id, model + ".mp4")
                subclip = video.subclip(start, end)
                subclip.write_videofile(clip_save_path, codec="libx264", audio_codec="aac")
                video_clips_path[model].append(clip_save_path)

        ## test for each model: one round per query (4 per entry)
        result_each_model = {}
        for model in model_list:
            videos = upload_videos(video_clips_path[model])
            result_each_model[model] = []
            for idx, query in enumerate(querys):
                if test_type == 'image':
                    query = query[0]
                    query_state = genai.upload_file(path=query, mime_type="image/png")
                elif test_type in ('audio', 'aduio'):  # accept the legacy misspelling
                    query = query[1]
                    query_state = genai.upload_file(path=query, mime_type="audio/wav")
                else:
                    raise ValueError(f"unknown test_type: {test_type!r}")

                # shuffle answer positions so ordering gives nothing away
                answer = idx
                ori_idxs = [0, 1, 2, 3]
                shuffled_idx = ori_idxs.copy()
                random.shuffle(shuffled_idx)
                mapping = dict(zip(ori_idxs, shuffled_idx))
                new_answer = mapping[answer]
                new_qs = [qs[mapping[i]] for i in ori_idxs]

                contents = [prompt, "Here are the quary", genai.get_file(query_state.name), "Here are the video clips"]
                contents.extend(videos)
                contents.extend(["Here are the questions"])
                contents.extend(new_qs)

                response = gemini_model.generate_content(contents)
                match = re.search(r"My choice:\s*(\d+)", response.text)
                choice_num = int(match.group(1)) - 1 if match else None
                result_each_model[model].append(
                    [query, new_qs, choice_num, new_answer, choice_num == new_answer])
        result_each_question.append(result_each_model)
        print(result_each_question)
    with open("ip_qa_result.json", 'w') as f:
        json.dump(result_each_question, f, indent=4)
117
-
118
def upload_videos(video_list):
    """Upload each local mp4 to the Gemini file store and block until every
    upload reaches the ACTIVE state; return refreshed file handles."""
    uploaded = [genai.upload_file(path=p, mime_type="video/mp4") for p in video_list]

    # poll until server-side processing finishes for all files
    while True:
        all_active = True
        for handle in uploaded:
            if genai.get_file(handle.name).state.name != "ACTIVE":
                all_active = False
                time.sleep(5)
                print(f"waiting 5 seconds...")
                break
        if all_active:
            break

    return [genai.get_file(handle.name) for handle in uploaded]
135
-
136
if __name__ == "__main__":
    # Benchmark configuration; replace the placeholder paths before running.
    clip_duration = 4  # seconds sampled from each presentation video
    prompt_path = "./prompt/ip_qa.txt"  # judge instructions for the IP quiz
    # Systems under comparison; names must match eval_ip's path layout cases.
    model_list = ["p2v", "p2v-o", "veo3", "wan2.2", "presentagent", "human-made"]
    root_path = "/path/to/result"
    question_path = "ip_qa.json"  # produced by IPMemory/construct.py
    eval_ip(root_path, clip_duration, model_list, prompt_path, question_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/MetaSim_audio.py DELETED
@@ -1,102 +0,0 @@
1
- import os, re, json
2
- import random
3
- import argparse
4
- import moviepy.editor as mp
5
- from os import path
6
- from pathlib import Path
7
- from typing import List
8
- from pyannote.audio import Audio
9
- from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
10
- from scipy.spatial.distance import cosine
11
-
12
-
13
def extract_random_audio_segment(video_path: str, output_wav_path: str, duration: float = 5.0):
    """Write a random `duration`-second 16 kHz PCM wav sampled from a video's
    audio track.

    Fixes vs. previous revision: when the track is shorter than `duration`,
    the subclip end is clamped to the track length (it previously ran past
    the end), and the video handle is closed to avoid leaking file handles.
    """
    print(video_path)
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        total_duration = audio.duration
        if duration >= total_duration:
            # track shorter than the requested window: take the whole track
            start_time, end_time = 0, total_duration
        else:
            start_time = random.uniform(0, total_duration - duration)
            end_time = start_time + duration

        audio_subclip = audio.subclip(start_time, end_time)
        audio_subclip.write_audiofile(output_wav_path, codec='pcm_s16le', fps=16000)
    finally:
        video.close()
24
-
25
def compute_speaker_similarity(audio_path_1: str, audio_path_2: str, device: str = "cuda") -> float:
    """Return the cosine similarity between ECAPA speaker embeddings of two
    16 kHz wav files (1.0 = same voice, lower = more dissimilar)."""
    embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=device)
    audio_loader = Audio(sample_rate=16000)

    embeddings = []
    for wav_path in (audio_path_1, audio_path_2):
        waveform, _ = audio_loader(wav_path)
        # keep the first channel only and add a batch dimension
        waveform = waveform[0:1].unsqueeze(0)
        embedding = embedding_model(waveform)
        embeddings.append(embedding.reshape(embedding.shape[1]))

    return 1 - cosine(embeddings[0], embeddings[1])
42
-
43
-
44
def get_audio_sim_score(gen_video_path, gt_video_path):
    """Sample a 5 s audio snippet from each video (written as a sibling .wav)
    and return the speaker similarity between the two snippets."""
    gen_wav_path = gen_video_path.replace('.mp4', '.wav')
    gt_wav_path = gt_video_path.replace('.mp4', '.wav')
    extract_random_audio_segment(gen_video_path, gen_wav_path, duration=5)
    extract_random_audio_segment(gt_video_path, gt_wav_path, duration=5)
    return compute_speaker_similarity(gen_wav_path, gt_wav_path)
50
-
51
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort by numeric basename prefix, non-numeric names last, ties by name."""
    def rank(p: str):
        base = Path(p).name
        found = _num_at_start.match(base)
        return (int(found.group(1)), base) if found else (float('inf'), base)

    return sorted(paths, key=rank)
58
-
59
- if __name__ == "__main__":
60
- parser = argparse.ArgumentParser()
61
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
62
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
63
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
64
- args = parser.parse_args()
65
-
66
- ## load exist result if have
67
- save_dir = args.save_dir
68
- save_dir = path.join(save_dir, path.basename(args.result_dir))
69
- save_path = path.join(save_dir, "audio_sim.json")
70
- os.makedirs(save_dir, exist_ok=True)
71
- if path.exists(save_path):
72
- with open(save_path, 'r') as f: audio_similarity_list = json.load(f)
73
- else: audio_similarity_list = []
74
-
75
- ## path
76
- gt_dir, result_dir = args.gt_dir, args.result_dir
77
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
78
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
79
-
80
- for index in range(len(audio_similarity_list), 40):
81
- if path.basename(args.result_dir) == "paper2video":
82
- p2v_video_path = path.join(result_list[index], "3_merage.mp4")
83
- elif path.basename(args.result_dir) == "veo3":
84
- p2v_video_path = path.join(result_list[index])
85
- else:
86
- p2v_video_path = path.join(result_list[index], "result.mp4")
87
- if path.exists(p2v_video_path) is False: continue
88
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
89
- if path.exists(gt_video_path) is False: continue
90
- print(p2v_video_path, gt_video_path)
91
- similarity = get_audio_sim_score(p2v_video_path, gt_video_path)
92
- audio_similarity_list.append({
93
- "data_idx": index,
94
- "score": similarity.item()
95
- })
96
- print(audio_similarity_list)
97
- with open(save_path, 'w') as f: json.dump(audio_similarity_list, f, indent=4)
98
-
99
- # import numpy as np
100
- # avg = np.average(similarity_all)
101
- # var = np.var(similarity_all)
102
- # print(avg, var)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/MetaSim_content.py DELETED
@@ -1,144 +0,0 @@
1
- import os, re, pdb, json
2
- from PIL import Image
3
- import pytesseract
4
-
5
- import whisperx
6
- import argparse
7
- import torch
8
- import numpy as np
9
- from os import path
10
- from pathlib import Path
11
- from typing import List
12
- from camel.models import ModelFactory
13
- from camel.types import ModelType, ModelPlatformType
14
- from camel.configs import GeminiConfig
15
-
16
-
17
- os.environ["GEMINI_API_KEY"] = ""
18
- prompt_path = "./prompt/content_sim_score.txt"
19
-
20
- agent_config = {
21
- "model_type": ModelType.GEMINI_2_5_FLASH,
22
- "model_config": GeminiConfig().as_dict(),
23
- "model_platform": ModelPlatformType.GEMINI,}
24
- actor_model = ModelFactory.create(
25
- model_platform=agent_config['model_platform'],
26
- model_type=agent_config['model_type'],
27
- model_config_dict=agent_config['model_config'],)
28
-
29
def extract_slide_texts(slide_dir):
    """OCR every slide image in `slide_dir` (sorted by filename) into a list
    of stripped text strings."""
    texts = []
    for fname in sorted(os.listdir(slide_dir)):
        if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        # named img_path so the module-level `os.path` alias isn't shadowed
        img_path = os.path.join(slide_dir, fname)
        texts.append(pytesseract.image_to_string(Image.open(img_path)).strip())
    return texts
37
-
38
def load_subtitles(sub_path):
    """Read a subtitle text file and return its non-empty lines, stripped."""
    with open(sub_path, "r") as f:
        return [stripped for stripped in (line.strip() for line in f) if stripped]
42
-
43
def build_prompt(slides_1, subs_1, slides_2, subs_2):
    """Assemble the judge prompt: human slides/subtitles followed by the
    generated presentation's slides/subtitles."""
    sections = [
        "Human Presentation:\n",
        "Slides:\n" + "\n".join(slides_1) + "\n",
        "Subtitles:\n" + "\n".join(subs_1) + "\n\n",
        "Generated Presentation:\n",
        "Slides:\n" + "\n".join(slides_2) + "\n",
        "Subtitles:\n" + "\n".join(subs_2) + "\n\n",
    ]
    return "".join(sections)
52
-
53
def run_similarity_eval(slide_dir_1, slide_dir_2, sub_path_1, sub_path_2):
    """Score content similarity between two presentations (slides + subtitles)
    with the module-level LLM judge and return its raw reply text.

    Fixes vs. previous revision: the rubric file is read verbatim with
    f.read() (readlines() + "\\n".join() doubled every newline), and the
    question block is appended with a real newline instead of the literal
    two-character string "/n".
    """
    slides_1 = extract_slide_texts(slide_dir_1)
    slides_2 = extract_slide_texts(slide_dir_2)
    subs_1 = load_subtitles(sub_path_1)
    subs_2 = load_subtitles(sub_path_2)

    with open(prompt_path, 'r') as f:
        prompt = f.read()
    prompt = prompt + '\n' + build_prompt(slides_1, subs_1, slides_2, subs_2)

    output = actor_model.run([{"role": "user", "content": prompt}])
    print("=== Similarity Evaluation ===\n")
    print(output.choices[0].message.content)
    return output.choices[0].message.content
68
-
69
def extract_plain_subtitle_with_whisperx(video_path: str, output_path: str, model_name: str = "large-v3", language: str = "en"):
    """Transcribe a video's audio with WhisperX and write one stripped
    segment per line to `output_path`."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisperx.load_model(model_name, device=device, language=language)

    result = model.transcribe(whisperx.load_audio(video_path), batch_size=16)

    lines = [segment["text"].strip() + "\n" for segment in result["segments"]]
    with open(output_path, "w") as f:
        f.writelines(lines)
79
-
80
def extract_similarity_scores(text):
    """Parse the judge's reply for a 'Content Similarity: N/5' score.

    Returns the integer score, or None when the pattern is absent — the old
    version fell through implicitly, which hid missing scores; the contract
    is now explicit and documented.
    """
    content_match = re.search(r"Content Similarity:\s*(\d+)/5", text)
    if content_match:
        return int(content_match.group(1))
    return None
85
-
86
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort by the integer prefix of each basename; names lacking one go last."""
    def ordering(p: str):
        base = Path(p).name
        m = _num_at_start.match(base)
        if m is None:
            return (float('inf'), base)
        return (int(m.group(1)), base)

    return sorted(paths, key=ordering)
93
-
94
- if __name__ == "__main__":
95
- parser = argparse.ArgumentParser()
96
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
97
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
98
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
99
- args = parser.parse_args()
100
-
101
- ## load exist result if have
102
- save_dir = args.save_dir
103
- save_dir = path.join(save_dir, path.basename(args.result_dir))
104
- save_path = path.join(save_dir, "content_sim.json")
105
- os.makedirs(save_dir, exist_ok=True)
106
- if path.exists(save_path):
107
- with open(save_path, 'r') as f: content_sim_list = json.load(f)
108
- else: content_sim_list = []
109
-
110
- ## path
111
- gt_dir, result_dir = args.gt_dir, args.result_dir
112
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
113
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
114
-
115
- ## eval
116
- for index in range(25, 100):
117
- # video -> subtitle
118
- if path.basename(args.result_dir) == "paper2video":
119
- p2v_video_path = path.join(result_list[index], "3_merage.mp4")
120
- if path.exists(p2v_video_path) is False: continue
121
- else:
122
- p2v_video_path = path.join(result_list[index], "result.mp4")
123
- if path.exists(p2v_video_path) is False: continue
124
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
125
- extract_plain_subtitle_with_whisperx(gt_video_path, gt_video_path.replace(".mp4", "_sub.txt"))
126
- extract_plain_subtitle_with_whisperx(p2v_video_path, p2v_video_path.replace(".mp4", "_sub.txt"))
127
-
128
- # slide dir
129
- gt_slide_dir = path.join(groundtruth_list[index], "slide_imgs")
130
- p2v_slide_dir = path.join(result_list[index], "slide_imgs")
131
-
132
- # eval
133
- result = run_similarity_eval(
134
- slide_dir_1=gt_slide_dir,
135
- slide_dir_2=p2v_slide_dir,
136
- sub_path_1=gt_video_path.replace(".mp4", "_sub.txt"),
137
- sub_path_2=p2v_video_path.replace(".mp4", "_sub.txt"))
138
- content_score = extract_similarity_scores(result)
139
- content_sim_list.append({
140
- "data_idx": index,
141
- "score": content_score
142
- })
143
-
144
- with open(save_path, 'w') as f: json.dump(content_sim_list, f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentArena.py DELETED
@@ -1,106 +0,0 @@
1
- '''
2
- Using VideoLLM (Gemini) as judger
3
- '''
4
- import os, re, json
5
- import time
6
- import argparse
7
- import google.generativeai as genai
8
- from os import path
9
- from typing import List
10
- from pathlib import Path
11
- from tqdm import tqdm
12
-
13
-
14
- genai.configure(api_key="")
15
def eval_gemini(gt_vid_path, gen_vid_path):
    """Ask Gemini which of two presentation videos is better.

    Uploads both videos, polls until they become ACTIVE, then sends the
    pairwise-comparison rubric. Returns the judge's raw reply text, or None
    when either upload fails server-side.

    Fix vs. previous revision: the rubric is read verbatim with f.read() —
    '"/n".join(f.readlines())' injected a literal "/n" between every line of
    the prompt.
    """
    model = genai.GenerativeModel("models/gemini-2.5-pro")
    gt_vid = genai.upload_file(path=gt_vid_path, mime_type="video/mp4")
    gen_vid = genai.upload_file(path=gen_vid_path, mime_type="video/mp4")
    # poll until both files finish server-side processing
    while True:
        refreshed_1 = genai.get_file(gt_vid.name)
        refreshed_2 = genai.get_file(gen_vid.name)
        if refreshed_1.state.name == "ACTIVE" and refreshed_2.state.name == "ACTIVE": break
        elif refreshed_1.state.name == "FAILED" or refreshed_2.state.name == "FAILED":
            # treat a failed upload as "no verdict" rather than crashing the sweep
            return None
        else:
            print(f"waiting 5 seconds...")
            time.sleep(5)

    prompt_path = "./prompt/which_is_better.txt"
    with open(prompt_path, 'r') as f:
        prompt = f.read()
    print("Sending prompt to Gemini...")
    response = model.generate_content([prompt, refreshed_1, refreshed_2])
    print("\n===== Evaluation Result =====")
    print(response.text)
    print("=============================\n")

    return response.text
40
-
41
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Numeric-prefix sort of paths by basename; non-numeric entries last."""
    def prefix_key(entry: str):
        base = Path(entry).name
        matched = _num_at_start.match(base)
        return ((int(matched.group(1)) if matched else float('inf')), base)

    return sorted(paths, key=prefix_key)
48
-
49
- if __name__ == "__main__":
50
- parser = argparse.ArgumentParser()
51
- parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
52
- parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
53
- parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
54
- args = parser.parse_args()
55
-
56
- ## load exist result if have
57
- save_dir = args.save_dir
58
- if path.basename(args.result_dir) == "paper2video":
59
- save_dir = path.join(save_dir, path.basename(args.result_dir))
60
- else: save_dir = path.join(save_dir, path.basename(args.result_dir))
61
-
62
- save_path = path.join(save_dir, "video_arena.json")
63
- os.makedirs(save_dir, exist_ok=True)
64
- if path.exists(save_path):
65
- with open(save_path, 'r') as f: arena_score_list = json.load(f)
66
- else: arena_score_list = []
67
-
68
- ## path
69
- gt_dir, result_dir = args.gt_dir, args.result_dir
70
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
71
- result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
72
-
73
- ## Generated v.s GT (1)
74
- for index in tqdm(len(result_list)):
75
- if path.basename(args.result_dir) == "paper2video":
76
- test_video_path = path.join(result_list[index], "3_merage.mp4")
77
- elif path.basename(args.result_dir) == 'veo3':
78
- test_video_path = result_list[index]
79
- else:
80
- test_video_path = path.join(result_list[index], "result.mp4")
81
-
82
- if path.exists(test_video_path) is False: continue
83
- gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
84
- if path.exists(gt_video_path) is False:
85
- gt_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
86
- if path.exists(gt_video_path) is False: continue
87
- result = eval_gemini(gt_video_path, test_video_path)
88
- if result is None: continue
89
-
90
- pat = r"\[(?:A|B)\]"
91
- m = re.findall(pat, result, flags=re.I)
92
- score = 0
93
- if m[0][1] == "B": score += 1
94
-
95
- result = eval_gemini(test_video_path, gt_video_path)
96
- if result is None: continue
97
-
98
- pat = r"\[(?:A|B)\]"
99
- m = re.findall(pat, result, flags=re.I)
100
- if m[0][1] == "A": score += 1
101
-
102
- arena_score_list.append({
103
- "data_idx": index,
104
- "score": score/2
105
- })
106
- with open(save_path, 'w') as f: json.dump(arena_score_list, f, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/PresentQuiz.py DELETED
@@ -1,264 +0,0 @@
1
- import random
2
- import string
3
- import yaml
4
- import PIL
5
- import tempfile
6
- import io
7
- import argparse
8
- from os import path
9
- from camel.models import ModelFactory
10
- from math import ceil
11
- from openai import OpenAI
12
- from camel.messages import BaseMessage
13
- from utils.src.model_utils import parse_pdf
14
- from urllib.parse import unquote
15
- from copy import deepcopy
16
- from transformers import AutoTokenizer, AutoModelForCausalLM
17
- from pytorch_fid.fid_score import compute_statistics_of_path
18
- import pytorch_fid.fid_score as fid
19
- from PIL import Image
20
- from httpx import Timeout
21
- from docling.document_converter import DocumentConverter, PdfFormatOption
22
- import re
23
- import shutil
24
- import pytesseract
25
- from utils.wei_utils import account_token
26
- from camel.types import ModelPlatformType, ModelType
27
- from marker.models import create_model_dict
28
- from camel.configs import ChatGPTConfig
29
- from camel.agents import ChatAgent
30
- from jinja2 import Environment, StrictUndefined
31
- from utils.src.utils import get_json_from_response
32
- from pathlib import Path
33
- from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
34
- from collections import defaultdict
35
- from camel.configs import ChatGPTConfig, QwenConfig, VLLMConfig, OpenRouterConfig, GeminiConfig
36
-
37
- from docling.datamodel.base_models import InputFormat
38
- from docling.datamodel.pipeline_options import PdfPipelineOptions
39
- from docling.document_converter import DocumentConverter, PdfFormatOption
40
-
41
- import math
42
- import base64
43
- import requests
44
- from io import BytesIO
45
- from PIL import Image
46
-
47
- import torch
48
- import json
49
- import os
50
- import pickle as pkl
51
- import numpy as np
52
- from transformers import AltCLIPProcessor, AltCLIPModel
53
- from pathlib import Path
54
- from typing import List
55
- from moviepy.editor import VideoFileClip
56
-
57
-
58
- os.environ["GEMINI_API_KEY"] = ""
59
-
60
def compute_accuracy(predicted, ground_truth, aspects):
    """Grade multiple-choice answers overall and per aspect.

    Parameters
    ----------
    predicted : dict
        {question: {'answer': <letter>, ...}, ...}
    ground_truth : dict
        {question: '<letter>. full answer', ...}
    aspects : dict
        {question: '<aspect name>', ...}

    Returns
    -------
    overall_accuracy : float
        correct / len(ground_truth) (0.0 when there are no questions).
    aspect_summary : dict
        {aspect: {'total': int, 'correct': int, 'accuracy': float}, ...}
    """
    total_global = len(ground_truth)
    total_by_aspect = defaultdict(int)
    correct_by_aspect = defaultdict(int)
    correct_global = 0

    for question, pred_info in predicted.items():
        aspect = aspects.get(question, 'Unknown')
        total_by_aspect[aspect] += 1

        if question not in ground_truth:
            continue
        # ground-truth answers look like "B. some text" — keep the letter only
        gt_letter = ground_truth[question].split('.')[0].strip()

        pred_letter = pred_info['answer']
        if not pred_letter:
            continue
        if pred_letter[0].upper() == gt_letter:
            correct_global += 1
            correct_by_aspect[aspect] += 1

    overall_accuracy = correct_global / total_global if total_global else 0.0

    aspect_summary = {
        aspect: {
            'total': total,
            'correct': correct_by_aspect[aspect],
            'accuracy': correct_by_aspect[aspect] / total if total else 0.0,
        }
        for aspect, total in total_by_aspect.items()
    }

    return overall_accuracy, aspect_summary
118
-
119
def eval_qa_get_answer(video_input, questions, answers, aspects, agent_config, input_type='video'):
    """Have an LLM agent answer quiz questions from a presentation video and
    grade the answers.

    Parameters
    ----------
    video_input : str     path to the video file shown to the agent
    questions : dict      question text keyed by question id
    answers : dict        ground-truth answers ('<letter>. text')
    aspects : dict        aspect label per question
    agent_config : dict   CAMEL model platform/type/config
    input_type : str      only 'video' is implemented

    Returns (accuracy, aspect_accuracy, agent_answers, input_token, output_token).

    Fix vs. previous revision: any input_type other than 'video' used to fall
    through with prompt/msg/response unbound and crash with a NameError; it
    now raises a clear ValueError up front, and the video file is only read
    when it will actually be used.
    """
    agent_name = f'answer_question_from_{input_type}'
    with open(f"prompt/{agent_name}.yaml", "r") as f:
        config = yaml.safe_load(f)

    actor_model = ModelFactory.create(
        model_platform=agent_config['model_platform'],
        model_type=agent_config['model_type'],
        model_config_dict=agent_config['model_config'],)

    actor_agent = ChatAgent(system_message=config['system_prompt'], model=actor_model, message_window_size=None,)
    actor_agent.reset()

    jinja_env = Environment(undefined=StrictUndefined)
    template = jinja_env.from_string(config["template"])

    if input_type != 'video':
        raise ValueError(f"unsupported input_type: {input_type!r}; only 'video' is implemented")

    with open(video_input, "rb") as f:
        video_bytes = f.read()
    prompt = template.render(**{'questions': questions,})

    # tell the agent the clip length so it does not cite impossible timestamps
    clip = VideoFileClip(video_input)
    duration = clip.duration
    msg = BaseMessage.make_user_message(
        role_name="User",
        content=prompt+"The video length is {}, you should NOT reference the timesteps if it exceeds video length".format(str(duration)),
        video_bytes=video_bytes,
        video_detail="low")
    response = actor_agent.step(msg)
    agent_answers = get_json_from_response(response.msgs[0].content)

    input_token, output_token = account_token(response)
    accuracy, aspect_accuracy = compute_accuracy(agent_answers, answers, aspects)
    return accuracy, aspect_accuracy, agent_answers, input_token, output_token
152
-
153
def run_qa_metric(question_path, video_path, result_path, test_model):
    """Run PresentQuiz for one paper/video pair and write the result JSON.

    question_path : JSON with 'detail' and 'understanding' tracks, each holding
                    questions / answers / aspects (see create_paper_questions).
    video_path    : presentation video the judge answers from.
    result_path   : output JSON for accuracies and raw agent answers.
    test_model    : judge identifier; only 'gemini' is currently wired up.
    """
    # NOTE(review): agent_config is only bound for 'gemini'; any other
    # test_model reaches eval_qa_get_answer with it undefined (NameError) —
    # confirm whether additional judges were planned here.
    if test_model == "gemini":
        agent_config = {
            "model_type": ModelType.GEMINI_2_5_FLASH,
            "model_config": GeminiConfig().as_dict(),
            "model_platform": ModelPlatformType.GEMINI,
        }
    overall_qa_result = {"qa_result": {}}

    qa_dict = json.load(open(question_path, 'r'))
    detail_qa, understanding_qa = qa_dict['detail'], qa_dict['understanding']
    input_token_all, output_token_all =0, 0
    # Track 1: fine-grained detail questions answered straight from the video.
    detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=detail_qa['questions'],
        answers=detail_qa['answers'],
        aspects=detail_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token
    # Track 2: holistic understanding questions.
    understanding_accuracy, understanding_aspect_accuracy, understanding_agent_answers, input_token, output_token = eval_qa_get_answer(
        video_input=video_path,
        questions=understanding_qa['questions'],
        answers=understanding_qa['answers'],
        aspects=understanding_qa['aspects'],
        agent_config=agent_config,
        input_type='video')
    input_token_all += input_token
    output_token_all += output_token
    overall_qa_result['qa_result'][test_model] = {
        'detail_accuracy': detail_accuracy,
        'detail_aspect_accuracy': detail_aspect_accuracy,
        'detail_agent_answers': detail_agent_answers,
        'understanding_accuracy': understanding_accuracy,
        'understanding_aspect_accuracy': understanding_aspect_accuracy,
        'understanding_agent_answers': understanding_agent_answers}
    # NOTE(review): overall_qa_result is rebuilt fresh above, so this average
    # only ever covers the single test_model just evaluated — confirm whether
    # it was meant to merge with a previously saved result file first.
    all_models_in_file = list(overall_qa_result['qa_result'].keys())
    detail_accs = []
    understanding_accs = []
    for m in all_models_in_file:
        detail_accs.append(overall_qa_result['qa_result'][m]['detail_accuracy'])
        understanding_accs.append(overall_qa_result['qa_result'][m]['understanding_accuracy'])

    avg_detail_accuracy = float(np.mean(detail_accs)) if detail_accs else 0.0
    avg_understanding_accuracy = float(np.mean(understanding_accs)) if understanding_accs else 0.0

    overall_qa_result['avg_detail_accuracy'] = avg_detail_accuracy
    overall_qa_result['avg_understanding_accuracy'] = avg_understanding_accuracy

    # Finally, overwrite the same JSON file with the updated results
    with open(result_path, 'w') as f: json.dump(overall_qa_result, f, indent=4)
    print(detail_accuracy, detail_aspect_accuracy, detail_agent_answers, input_token, output_token)
206
-
207
_num_at_start = re.compile(r'^\s*["\']?(\d+)')

def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Sort paths by the numeric prefix of the basename; others sort last."""
    def as_key(item: str):
        leaf = Path(item).name
        num = _num_at_start.match(leaf)
        return (int(num.group(1)) if num else float('inf'), leaf)

    return sorted(paths, key=as_key)
214
-
215
- if __name__ == "__main__":
216
- parser = argparse.ArgumentParser()
217
- parser.add_argument("-r", "--result_dir", default="/path/to/result")
218
- parser.add_argument("-g", "--data_dir", default="/path/to/data")
219
- parser.add_argument("-s", "--save_dir", default="/path/to/data")
220
- args = parser.parse_args()
221
- ## mkdirs
222
- save_dir = args.save_dir
223
- if path.basename(args.result_dir) == "paper2video":
224
- save_dir = path.join(save_dir, path.basename(args.result_dir))
225
- else: save_dir = path.join(save_dir, path.basename(args.result_dir))
226
-
227
- save_path = path.join(save_dir, "qa_result")
228
- os.makedirs(save_dir, exist_ok=True)
229
- os.makedirs(save_path, exist_ok=True)
230
-
231
- ## run test
232
- gt_dir, result_dir = args.data_dir, args.result_dir
233
- groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
234
- if path.basename(args.result_dir) == "human_made": result_list = [] # from dataset
235
- else: result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
236
-
237
- start, end = 1, 100
238
- for index in range(start, end):
239
- qa_json_path = path.join(groundtruth_list[index], "4o-mini_qa.json")
240
-
241
- ## paper2video
242
- if path.basename(args.result_dir) == 'paper2video':
243
- if without_presenter_flag is False:
244
- test_video_path = path.join(result_list[index], "3_merage.mp4")
245
- else:
246
- test_video_path = path.join(result_list[index], "1_merage.mp4")
247
- if path.exists(test_video_path) is False: continue
248
- ## human made as baseline
249
- elif path.basename(args.result_dir) == 'human_made':
250
- test_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
251
- if path.exists(test_video_path) is False:
252
- test_video_path = path.join(groundtruth_list[index], "raw_video.mp4")
253
- ## veo3
254
- elif path.basename(args.result_dir) == 'veo3':
255
- test_video_path = result_list[index]
256
- elif path.basename(args.result_dir) == 'wan2.1':
257
- test_video_path = path.join(result_list[index], "result.mp4")
258
- ## presentagent
259
- else:
260
- test_video_path = path.join(result_list[index], "result.mp4")
261
- if path.exists(test_video_path) is False: continue
262
- result_save_path = path.join(save_path, "qa_result_{}.json".format(index))
263
- print("start")
264
- run_qa_metric(qa_json_path, test_video_path, result_save_path, 'gemini')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/create_paper_questions.py DELETED
@@ -1,47 +0,0 @@
1
- from utils.poster_eval_utils import *
2
- import argparse
3
- import os
4
- import json
5
-
6
-
7
- os.environ["OPENAI_API_KEY"] = ""
8
-
9
-
10
- if __name__ == '__main__':
11
- parser = argparse.ArgumentParser()
12
- parser.add_argument('--paper_folder', type=str, default="path/to/data")
13
- parser.add_argument('--model_name', type=str, default='4o')
14
- args = parser.parse_args()
15
-
16
- paper_text = get_poster_text(os.path.join(args.paper_folder, 'pdf', 'paper.pdf'))
17
-
18
- if args.model_name == '4o':
19
- model_type = ModelType.GPT_4O
20
- elif args.model_name == 'o3':
21
- model_type = ModelType.O3
22
- elif args.model_name == 'gemini':
23
- model_type = ModelType.GEMINI_2_5_PRO
24
-
25
- detail_qa = get_questions(paper_text, 'detail', model_type)
26
- understanding_qa = get_questions(paper_text, 'understanding', model_type)
27
-
28
- detail_q, detail_a, detail_aspects = get_answers_and_remove_answers(detail_qa)
29
- understanding_q, understanding_a, understanding_aspects = get_answers_and_remove_answers(understanding_qa)
30
-
31
- final_qa = {}
32
- detail_qa = {
33
- 'questions': detail_q,
34
- 'answers': detail_a,
35
- 'aspects': detail_aspects,
36
- }
37
-
38
- understanding_qa = {
39
- 'questions': understanding_q,
40
- 'answers': understanding_a,
41
- 'aspects': understanding_aspects,
42
- }
43
- final_qa['detail'] = detail_qa
44
- final_qa['understanding'] = understanding_qa
45
-
46
- with open(os.path.join(args.paper_folder, f'{args.model_name}_qa.json'), 'w') as f:
47
- json.dump(final_qa, f, indent=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/abstract_backend.py DELETED
@@ -1,63 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import TYPE_CHECKING, Set, Union
5
-
6
- from docling_core.types.doc import DoclingDocument
7
-
8
- if TYPE_CHECKING:
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import InputDocument
11
-
12
-
13
- class AbstractDocumentBackend(ABC):
14
- @abstractmethod
15
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
- self.file = in_doc.file
17
- self.path_or_stream = path_or_stream
18
- self.document_hash = in_doc.document_hash
19
- self.input_format = in_doc.format
20
-
21
- @abstractmethod
22
- def is_valid(self) -> bool:
23
- pass
24
-
25
- @classmethod
26
- @abstractmethod
27
- def supports_pagination(cls) -> bool:
28
- pass
29
-
30
- def unload(self):
31
- if isinstance(self.path_or_stream, BytesIO):
32
- self.path_or_stream.close()
33
-
34
- self.path_or_stream = None
35
-
36
- @classmethod
37
- @abstractmethod
38
- def supported_formats(cls) -> Set["InputFormat"]:
39
- pass
40
-
41
-
42
- class PaginatedDocumentBackend(AbstractDocumentBackend):
43
- """DeclarativeDocumentBackend.
44
-
45
- A declarative document backend is a backend that can transform to DoclingDocument
46
- straight without a recognition pipeline.
47
- """
48
-
49
- @abstractmethod
50
- def page_count(self) -> int:
51
- pass
52
-
53
-
54
- class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
- """DeclarativeDocumentBackend.
56
-
57
- A declarative document backend is a backend that can transform to DoclingDocument
58
- straight without a recognition pipeline.
59
- """
60
-
61
- @abstractmethod
62
- def convert(self) -> DoclingDocument:
63
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/asciidoc_backend.py DELETED
@@ -1,430 +0,0 @@
1
- import logging
2
- import re
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Set, Union
6
-
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupItem,
12
- GroupLabel,
13
- ImageRef,
14
- Size,
15
- TableCell,
16
- TableData,
17
- )
18
-
19
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
- from docling.datamodel.base_models import InputFormat
21
- from docling.datamodel.document import InputDocument
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class AsciiDocBackend(DeclarativeDocumentBackend):
27
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
28
- super().__init__(in_doc, path_or_stream)
29
-
30
- self.path_or_stream = path_or_stream
31
-
32
- try:
33
- if isinstance(self.path_or_stream, BytesIO):
34
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
35
- self.lines = text_stream.split("\n")
36
- if isinstance(self.path_or_stream, Path):
37
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
38
- self.lines = f.readlines()
39
- self.valid = True
40
-
41
- except Exception as e:
42
- raise RuntimeError(
43
- f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
44
- ) from e
45
- return
46
-
47
- def is_valid(self) -> bool:
48
- return self.valid
49
-
50
- @classmethod
51
- def supports_pagination(cls) -> bool:
52
- return False
53
-
54
- def unload(self):
55
- return
56
-
57
- @classmethod
58
- def supported_formats(cls) -> Set[InputFormat]:
59
- return {InputFormat.ASCIIDOC}
60
-
61
- def convert(self) -> DoclingDocument:
62
- """
63
- Parses the ASCII into a structured document model.
64
- """
65
-
66
- origin = DocumentOrigin(
67
- filename=self.file.name or "file",
68
- mimetype="text/asciidoc",
69
- binary_hash=self.document_hash,
70
- )
71
-
72
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
73
-
74
- doc = self._parse(doc)
75
-
76
- return doc
77
-
78
- def _parse(self, doc: DoclingDocument):
79
- """
80
- Main function that orchestrates the parsing by yielding components:
81
- title, section headers, text, lists, and tables.
82
- """
83
-
84
- content = ""
85
-
86
- in_list = False
87
- in_table = False
88
-
89
- text_data: list[str] = []
90
- table_data: list[str] = []
91
- caption_data: list[str] = []
92
-
93
- # parents: dict[int, Union[DocItem, GroupItem, None]] = {}
94
- parents: dict[int, Union[GroupItem, None]] = {}
95
- # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
96
- indents: dict[int, Union[GroupItem, None]] = {}
97
-
98
- for i in range(0, 10):
99
- parents[i] = None
100
- indents[i] = None
101
-
102
- for line in self.lines:
103
- # line = line.strip()
104
-
105
- # Title
106
- if self._is_title(line):
107
- item = self._parse_title(line)
108
- level = item["level"]
109
-
110
- parents[level] = doc.add_text(
111
- text=item["text"], label=DocItemLabel.TITLE
112
- )
113
-
114
- # Section headers
115
- elif self._is_section_header(line):
116
- item = self._parse_section_header(line)
117
- level = item["level"]
118
-
119
- parents[level] = doc.add_heading(
120
- text=item["text"], level=item["level"], parent=parents[level - 1]
121
- )
122
- for k, v in parents.items():
123
- if k > level:
124
- parents[k] = None
125
-
126
- # Lists
127
- elif self._is_list_item(line):
128
-
129
- _log.debug(f"line: {line}")
130
- item = self._parse_list_item(line)
131
- _log.debug(f"parsed list-item: {item}")
132
-
133
- level = self._get_current_level(parents)
134
-
135
- if not in_list:
136
- in_list = True
137
-
138
- parents[level + 1] = doc.add_group(
139
- parent=parents[level], name="list", label=GroupLabel.LIST
140
- )
141
- indents[level + 1] = item["indent"]
142
-
143
- elif in_list and item["indent"] > indents[level]:
144
- parents[level + 1] = doc.add_group(
145
- parent=parents[level], name="list", label=GroupLabel.LIST
146
- )
147
- indents[level + 1] = item["indent"]
148
-
149
- elif in_list and item["indent"] < indents[level]:
150
-
151
- # print(item["indent"], " => ", indents[level])
152
- while item["indent"] < indents[level]:
153
- # print(item["indent"], " => ", indents[level])
154
- parents[level] = None
155
- indents[level] = None
156
- level -= 1
157
-
158
- doc.add_list_item(
159
- item["text"], parent=self._get_current_parent(parents)
160
- )
161
-
162
- elif in_list and not self._is_list_item(line):
163
- in_list = False
164
-
165
- level = self._get_current_level(parents)
166
- parents[level] = None
167
-
168
- # Tables
169
- elif line.strip() == "|===" and not in_table: # start of table
170
- in_table = True
171
-
172
- elif self._is_table_line(line): # within a table
173
- in_table = True
174
- table_data.append(self._parse_table_line(line))
175
-
176
- elif in_table and (
177
- (not self._is_table_line(line)) or line.strip() == "|==="
178
- ): # end of table
179
-
180
- caption = None
181
- if len(caption_data) > 0:
182
- caption = doc.add_text(
183
- text=" ".join(caption_data), label=DocItemLabel.CAPTION
184
- )
185
-
186
- caption_data = []
187
-
188
- data = self._populate_table_as_grid(table_data)
189
- doc.add_table(
190
- data=data, parent=self._get_current_parent(parents), caption=caption
191
- )
192
-
193
- in_table = False
194
- table_data = []
195
-
196
- # Picture
197
- elif self._is_picture(line):
198
-
199
- caption = None
200
- if len(caption_data) > 0:
201
- caption = doc.add_text(
202
- text=" ".join(caption_data), label=DocItemLabel.CAPTION
203
- )
204
-
205
- caption_data = []
206
-
207
- item = self._parse_picture(line)
208
-
209
- size = None
210
- if "width" in item and "height" in item:
211
- size = Size(width=int(item["width"]), height=int(item["height"]))
212
-
213
- uri = None
214
- if (
215
- "uri" in item
216
- and not item["uri"].startswith("http")
217
- and item["uri"].startswith("//")
218
- ):
219
- uri = "file:" + item["uri"]
220
- elif (
221
- "uri" in item
222
- and not item["uri"].startswith("http")
223
- and item["uri"].startswith("/")
224
- ):
225
- uri = "file:/" + item["uri"]
226
- elif "uri" in item and not item["uri"].startswith("http"):
227
- uri = "file://" + item["uri"]
228
-
229
- image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
230
- doc.add_picture(image=image, caption=caption)
231
-
232
- # Caption
233
- elif self._is_caption(line) and len(caption_data) == 0:
234
- item = self._parse_caption(line)
235
- caption_data.append(item["text"])
236
-
237
- elif (
238
- len(line.strip()) > 0 and len(caption_data) > 0
239
- ): # allow multiline captions
240
- item = self._parse_text(line)
241
- caption_data.append(item["text"])
242
-
243
- # Plain text
244
- elif len(line.strip()) == 0 and len(text_data) > 0:
245
- doc.add_text(
246
- text=" ".join(text_data),
247
- label=DocItemLabel.PARAGRAPH,
248
- parent=self._get_current_parent(parents),
249
- )
250
- text_data = []
251
-
252
- elif len(line.strip()) > 0: # allow multiline texts
253
-
254
- item = self._parse_text(line)
255
- text_data.append(item["text"])
256
-
257
- if len(text_data) > 0:
258
- doc.add_text(
259
- text=" ".join(text_data),
260
- label=DocItemLabel.PARAGRAPH,
261
- parent=self._get_current_parent(parents),
262
- )
263
- text_data = []
264
-
265
- if in_table and len(table_data) > 0:
266
- data = self._populate_table_as_grid(table_data)
267
- doc.add_table(data=data, parent=self._get_current_parent(parents))
268
-
269
- in_table = False
270
- table_data = []
271
-
272
- return doc
273
-
274
- def _get_current_level(self, parents):
275
- for k, v in parents.items():
276
- if v == None and k > 0:
277
- return k - 1
278
-
279
- return 0
280
-
281
- def _get_current_parent(self, parents):
282
- for k, v in parents.items():
283
- if v == None and k > 0:
284
- return parents[k - 1]
285
-
286
- return None
287
-
288
- # ========= Title
289
- def _is_title(self, line):
290
- return re.match(r"^= ", line)
291
-
292
- def _parse_title(self, line):
293
- return {"type": "title", "text": line[2:].strip(), "level": 0}
294
-
295
- # ========= Section headers
296
- def _is_section_header(self, line):
297
- return re.match(r"^==+", line)
298
-
299
- def _parse_section_header(self, line):
300
- match = re.match(r"^(=+)\s+(.*)", line)
301
-
302
- marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
303
- text = match.group(2) # The actual text of the list item
304
-
305
- header_level = marker.count("=") # number of '=' represents level
306
- return {
307
- "type": "header",
308
- "level": header_level - 1,
309
- "text": text.strip(),
310
- }
311
-
312
- # ========= Lists
313
- def _is_list_item(self, line):
314
- return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
315
-
316
- def _parse_list_item(self, line):
317
- """Extract the item marker (number or bullet symbol) and the text of the item."""
318
-
319
- match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
320
- if match:
321
- indent = match.group(1)
322
- marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
323
- text = match.group(3) # The actual text of the list item
324
-
325
- if marker == "*" or marker == "-":
326
- return {
327
- "type": "list_item",
328
- "marker": marker,
329
- "text": text.strip(),
330
- "numbered": False,
331
- "indent": 0 if indent == None else len(indent),
332
- }
333
- else:
334
- return {
335
- "type": "list_item",
336
- "marker": marker,
337
- "text": text.strip(),
338
- "numbered": True,
339
- "indent": 0 if indent == None else len(indent),
340
- }
341
- else:
342
- # Fallback if no match
343
- return {
344
- "type": "list_item",
345
- "marker": "-",
346
- "text": line,
347
- "numbered": False,
348
- "indent": 0,
349
- }
350
-
351
- # ========= Tables
352
- def _is_table_line(self, line):
353
- return re.match(r"^\|.*\|", line)
354
-
355
- def _parse_table_line(self, line):
356
- # Split table cells and trim extra spaces
357
- return [cell.strip() for cell in line.split("|") if cell.strip()]
358
-
359
- def _populate_table_as_grid(self, table_data):
360
-
361
- num_rows = len(table_data)
362
-
363
- # Adjust the table data into a grid format
364
- num_cols = max(len(row) for row in table_data)
365
-
366
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
367
- for row_idx, row in enumerate(table_data):
368
- # Pad rows with empty strings to match column count
369
- # grid.append(row + [''] * (max_cols - len(row)))
370
-
371
- for col_idx, text in enumerate(row):
372
- row_span = 1
373
- col_span = 1
374
-
375
- cell = TableCell(
376
- text=text,
377
- row_span=row_span,
378
- col_span=col_span,
379
- start_row_offset_idx=row_idx,
380
- end_row_offset_idx=row_idx + row_span,
381
- start_col_offset_idx=col_idx,
382
- end_col_offset_idx=col_idx + col_span,
383
- col_header=False,
384
- row_header=False,
385
- )
386
- data.table_cells.append(cell)
387
-
388
- return data
389
-
390
- # ========= Pictures
391
- def _is_picture(self, line):
392
- return re.match(r"^image::", line)
393
-
394
- def _parse_picture(self, line):
395
- """
396
- Parse an image macro, extracting its path and attributes.
397
- Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
398
- """
399
- mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
400
- if mtch:
401
- picture_path = mtch.group(1).strip()
402
- attributes = mtch.group(2).split(",")
403
- picture_info = {"type": "picture", "uri": picture_path}
404
-
405
- # Extract optional attributes (alt text, width, height, alignment)
406
- if attributes:
407
- picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
408
- for attr in attributes[1:]:
409
- key, value = attr.split("=")
410
- picture_info[key.strip()] = value.strip()
411
-
412
- return picture_info
413
-
414
- return {"type": "picture", "uri": line}
415
-
416
- # ========= Captions
417
- def _is_caption(self, line):
418
- return re.match(r"^\.(.+)", line)
419
-
420
- def _parse_caption(self, line):
421
- mtch = re.match(r"^\.(.+)", line)
422
- if mtch:
423
- text = mtch.group(1)
424
- return {"type": "caption", "text": text}
425
-
426
- return {"type": "caption", "text": ""}
427
-
428
- # ========= Plain text
429
- def _parse_text(self, line):
430
- return {"type": "text", "text": line.strip()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_backend.py DELETED
@@ -1,227 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.pdf_parsers import pdf_parser_v1
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfPage
12
-
13
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell
15
- from docling.datamodel.document import InputDocument
16
-
17
- _log = logging.getLogger(__name__)
18
-
19
-
20
- class DoclingParsePageBackend(PdfPageBackend):
21
- def __init__(
22
- self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
23
- ):
24
- self._ppage = page_obj
25
- parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
26
-
27
- self.valid = "pages" in parsed_page
28
- if self.valid:
29
- self._dpage = parsed_page["pages"][0]
30
- else:
31
- _log.info(
32
- f"An error occurred when loading page {page_no} of document {document_hash}."
33
- )
34
-
35
- def is_valid(self) -> bool:
36
- return self.valid
37
-
38
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
39
- if not self.valid:
40
- return ""
41
- # Find intersecting cells on the page
42
- text_piece = ""
43
- page_size = self.get_size()
44
- parser_width = self._dpage["width"]
45
- parser_height = self._dpage["height"]
46
-
47
- scale = (
48
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
49
- )
50
-
51
- for i in range(len(self._dpage["cells"])):
52
- rect = self._dpage["cells"][i]["box"]["device"]
53
- x0, y0, x1, y1 = rect
54
- cell_bbox = BoundingBox(
55
- l=x0 * scale * page_size.width / parser_width,
56
- b=y0 * scale * page_size.height / parser_height,
57
- r=x1 * scale * page_size.width / parser_width,
58
- t=y1 * scale * page_size.height / parser_height,
59
- coord_origin=CoordOrigin.BOTTOMLEFT,
60
- ).to_top_left_origin(page_height=page_size.height * scale)
61
-
62
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
63
-
64
- if overlap_frac > 0.5:
65
- if len(text_piece) > 0:
66
- text_piece += " "
67
- text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
68
-
69
- return text_piece
70
-
71
- def get_text_cells(self) -> Iterable[Cell]:
72
- cells: List[Cell] = []
73
- cell_counter = 0
74
-
75
- if not self.valid:
76
- return cells
77
-
78
- page_size = self.get_size()
79
-
80
- parser_width = self._dpage["width"]
81
- parser_height = self._dpage["height"]
82
-
83
- for i in range(len(self._dpage["cells"])):
84
- rect = self._dpage["cells"][i]["box"]["device"]
85
- x0, y0, x1, y1 = rect
86
-
87
- if x1 < x0:
88
- x0, x1 = x1, x0
89
- if y1 < y0:
90
- y0, y1 = y1, y0
91
-
92
- text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
93
- cells.append(
94
- Cell(
95
- id=cell_counter,
96
- text=text_piece,
97
- bbox=BoundingBox(
98
- # l=x0, b=y0, r=x1, t=y1,
99
- l=x0 * page_size.width / parser_width,
100
- b=y0 * page_size.height / parser_height,
101
- r=x1 * page_size.width / parser_width,
102
- t=y1 * page_size.height / parser_height,
103
- coord_origin=CoordOrigin.BOTTOMLEFT,
104
- ).to_top_left_origin(page_size.height),
105
- )
106
- )
107
- cell_counter += 1
108
-
109
- def draw_clusters_and_cells():
110
- image = (
111
- self.get_page_image()
112
- ) # make new image to avoid drawing on the saved ones
113
- draw = ImageDraw.Draw(image)
114
- for c in cells:
115
- x0, y0, x1, y1 = c.bbox.as_tuple()
116
- cell_color = (
117
- random.randint(30, 140),
118
- random.randint(30, 140),
119
- random.randint(30, 140),
120
- )
121
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
122
- image.show()
123
-
124
- # before merge:
125
- # draw_clusters_and_cells()
126
-
127
- # cells = merge_horizontal_cells(cells)
128
-
129
- # after merge:
130
- # draw_clusters_and_cells()
131
-
132
- return cells
133
-
134
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
135
- AREA_THRESHOLD = 0 # 32 * 32
136
-
137
- for i in range(len(self._dpage["images"])):
138
- bitmap = self._dpage["images"][i]
139
- cropbox = BoundingBox.from_tuple(
140
- bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
141
- ).to_top_left_origin(self.get_size().height)
142
-
143
- if cropbox.area() > AREA_THRESHOLD:
144
- cropbox = cropbox.scaled(scale=scale)
145
-
146
- yield cropbox
147
-
148
- def get_page_image(
149
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
150
- ) -> Image.Image:
151
-
152
- page_size = self.get_size()
153
-
154
- if not cropbox:
155
- cropbox = BoundingBox(
156
- l=0,
157
- r=page_size.width,
158
- t=0,
159
- b=page_size.height,
160
- coord_origin=CoordOrigin.TOPLEFT,
161
- )
162
- padbox = BoundingBox(
163
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
164
- )
165
- else:
166
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
167
- padbox.r = page_size.width - padbox.r
168
- padbox.t = page_size.height - padbox.t
169
-
170
- image = (
171
- self._ppage.render(
172
- scale=scale * 1.5,
173
- rotation=0, # no additional rotation
174
- crop=padbox.as_tuple(),
175
- )
176
- .to_pil()
177
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
178
- ) # We resize the image from 1.5x the given scale to make it sharper.
179
-
180
- return image
181
-
182
- def get_size(self) -> Size:
183
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
184
-
185
- def unload(self):
186
- self._ppage = None
187
- self._dpage = None
188
-
189
-
190
- class DoclingParseDocumentBackend(PdfDocumentBackend):
191
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
- super().__init__(in_doc, path_or_stream)
193
-
194
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
195
- self.parser = pdf_parser_v1()
196
-
197
- success = False
198
- if isinstance(self.path_or_stream, BytesIO):
199
- success = self.parser.load_document_from_bytesio(
200
- self.document_hash, self.path_or_stream
201
- )
202
- elif isinstance(self.path_or_stream, Path):
203
- success = self.parser.load_document(
204
- self.document_hash, str(self.path_or_stream)
205
- )
206
-
207
- if not success:
208
- raise RuntimeError(
209
- f"docling-parse could not load document with hash {self.document_hash}."
210
- )
211
-
212
- def page_count(self) -> int:
213
- return len(self._pdoc) # To be replaced with docling-parse API
214
-
215
- def load_page(self, page_no: int) -> DoclingParsePageBackend:
216
- return DoclingParsePageBackend(
217
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
218
- )
219
-
220
- def is_valid(self) -> bool:
221
- return self.page_count() > 0
222
-
223
- def unload(self):
224
- super().unload()
225
- self.parser.unload_document(self.document_hash)
226
- self._pdoc.close()
227
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/docling_parse_v2_backend.py DELETED
@@ -1,250 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.pdf_parsers import pdf_parser_v2
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfPage
12
-
13
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell, Size
15
-
16
- if TYPE_CHECKING:
17
- from docling.datamodel.document import InputDocument
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class DoclingParseV2PageBackend(PdfPageBackend):
23
- def __init__(
24
- self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
25
- ):
26
- self._ppage = page_obj
27
- parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
28
-
29
- self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
30
- if self.valid:
31
- self._dpage = parsed_page["pages"][0]
32
- else:
33
- _log.info(
34
- f"An error occurred when loading page {page_no} of document {document_hash}."
35
- )
36
-
37
- def is_valid(self) -> bool:
38
- return self.valid
39
-
40
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
41
- if not self.valid:
42
- return ""
43
- # Find intersecting cells on the page
44
- text_piece = ""
45
- page_size = self.get_size()
46
-
47
- parser_width = self._dpage["sanitized"]["dimension"]["width"]
48
- parser_height = self._dpage["sanitized"]["dimension"]["height"]
49
-
50
- scale = (
51
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
52
- )
53
-
54
- cells_data = self._dpage["sanitized"]["cells"]["data"]
55
- cells_header = self._dpage["sanitized"]["cells"]["header"]
56
-
57
- for i, cell_data in enumerate(cells_data):
58
- x0 = cell_data[cells_header.index("x0")]
59
- y0 = cell_data[cells_header.index("y0")]
60
- x1 = cell_data[cells_header.index("x1")]
61
- y1 = cell_data[cells_header.index("y1")]
62
-
63
- cell_bbox = BoundingBox(
64
- l=x0 * scale * page_size.width / parser_width,
65
- b=y0 * scale * page_size.height / parser_height,
66
- r=x1 * scale * page_size.width / parser_width,
67
- t=y1 * scale * page_size.height / parser_height,
68
- coord_origin=CoordOrigin.BOTTOMLEFT,
69
- ).to_top_left_origin(page_height=page_size.height * scale)
70
-
71
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
72
-
73
- if overlap_frac > 0.5:
74
- if len(text_piece) > 0:
75
- text_piece += " "
76
- text_piece += cell_data[cells_header.index("text")]
77
-
78
- return text_piece
79
-
80
- def get_text_cells(self) -> Iterable[Cell]:
81
- cells: List[Cell] = []
82
- cell_counter = 0
83
-
84
- if not self.valid:
85
- return cells
86
-
87
- page_size = self.get_size()
88
-
89
- parser_width = self._dpage["sanitized"]["dimension"]["width"]
90
- parser_height = self._dpage["sanitized"]["dimension"]["height"]
91
-
92
- cells_data = self._dpage["sanitized"]["cells"]["data"]
93
- cells_header = self._dpage["sanitized"]["cells"]["header"]
94
-
95
- for i, cell_data in enumerate(cells_data):
96
- x0 = cell_data[cells_header.index("x0")]
97
- y0 = cell_data[cells_header.index("y0")]
98
- x1 = cell_data[cells_header.index("x1")]
99
- y1 = cell_data[cells_header.index("y1")]
100
-
101
- if x1 < x0:
102
- x0, x1 = x1, x0
103
- if y1 < y0:
104
- y0, y1 = y1, y0
105
-
106
- text_piece = cell_data[cells_header.index("text")]
107
- cells.append(
108
- Cell(
109
- id=cell_counter,
110
- text=text_piece,
111
- bbox=BoundingBox(
112
- # l=x0, b=y0, r=x1, t=y1,
113
- l=x0 * page_size.width / parser_width,
114
- b=y0 * page_size.height / parser_height,
115
- r=x1 * page_size.width / parser_width,
116
- t=y1 * page_size.height / parser_height,
117
- coord_origin=CoordOrigin.BOTTOMLEFT,
118
- ).to_top_left_origin(page_size.height),
119
- )
120
- )
121
- cell_counter += 1
122
-
123
- def draw_clusters_and_cells():
124
- image = (
125
- self.get_page_image()
126
- ) # make new image to avoid drawing on the saved ones
127
- draw = ImageDraw.Draw(image)
128
- for c in cells:
129
- x0, y0, x1, y1 = c.bbox.as_tuple()
130
- cell_color = (
131
- random.randint(30, 140),
132
- random.randint(30, 140),
133
- random.randint(30, 140),
134
- )
135
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
136
- image.show()
137
-
138
- # draw_clusters_and_cells()
139
-
140
- return cells
141
-
142
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
143
- AREA_THRESHOLD = 0 # 32 * 32
144
-
145
- images = self._dpage["sanitized"]["images"]["data"]
146
- images_header = self._dpage["sanitized"]["images"]["header"]
147
-
148
- for row in images:
149
- x0 = row[images_header.index("x0")]
150
- y0 = row[images_header.index("y0")]
151
- x1 = row[images_header.index("x1")]
152
- y1 = row[images_header.index("y1")]
153
-
154
- cropbox = BoundingBox.from_tuple(
155
- (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
156
- ).to_top_left_origin(self.get_size().height)
157
-
158
- if cropbox.area() > AREA_THRESHOLD:
159
- cropbox = cropbox.scaled(scale=scale)
160
-
161
- yield cropbox
162
-
163
- def get_page_image(
164
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
165
- ) -> Image.Image:
166
-
167
- page_size = self.get_size()
168
-
169
- if not cropbox:
170
- cropbox = BoundingBox(
171
- l=0,
172
- r=page_size.width,
173
- t=0,
174
- b=page_size.height,
175
- coord_origin=CoordOrigin.TOPLEFT,
176
- )
177
- padbox = BoundingBox(
178
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
179
- )
180
- else:
181
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
182
- padbox.r = page_size.width - padbox.r
183
- padbox.t = page_size.height - padbox.t
184
-
185
- image = (
186
- self._ppage.render(
187
- scale=scale * 1.5,
188
- rotation=0, # no additional rotation
189
- crop=padbox.as_tuple(),
190
- )
191
- .to_pil()
192
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
193
- ) # We resize the image from 1.5x the given scale to make it sharper.
194
-
195
- return image
196
-
197
- def get_size(self) -> Size:
198
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
199
-
200
- def unload(self):
201
- self._ppage = None
202
- self._dpage = None
203
-
204
-
205
- class DoclingParseV2DocumentBackend(PdfDocumentBackend):
206
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
- super().__init__(in_doc, path_or_stream)
208
-
209
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
210
- self.parser = pdf_parser_v2("fatal")
211
-
212
- success = False
213
- if isinstance(self.path_or_stream, BytesIO):
214
- success = self.parser.load_document_from_bytesio(
215
- self.document_hash, self.path_or_stream
216
- )
217
- elif isinstance(self.path_or_stream, Path):
218
- success = self.parser.load_document(
219
- self.document_hash, str(self.path_or_stream)
220
- )
221
-
222
- if not success:
223
- raise RuntimeError(
224
- f"docling-parse v2 could not load document {self.document_hash}."
225
- )
226
-
227
- def page_count(self) -> int:
228
- # return len(self._pdoc) # To be replaced with docling-parse API
229
-
230
- len_1 = len(self._pdoc)
231
- len_2 = self.parser.number_of_pages(self.document_hash)
232
-
233
- if len_1 != len_2:
234
- _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
235
-
236
- return len_2
237
-
238
- def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
239
- return DoclingParseV2PageBackend(
240
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
241
- )
242
-
243
- def is_valid(self) -> bool:
244
- return self.page_count() > 0
245
-
246
- def unload(self):
247
- super().unload()
248
- self.parser.unload_document(self.document_hash)
249
- self._pdoc.close()
250
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/html_backend.py DELETED
@@ -1,442 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Optional, Set, Union
5
-
6
- from bs4 import BeautifulSoup, Tag
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupLabel,
12
- TableCell,
13
- TableData,
14
- )
15
-
16
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
17
- from docling.datamodel.base_models import InputFormat
18
- from docling.datamodel.document import InputDocument
19
-
20
- _log = logging.getLogger(__name__)
21
-
22
-
23
- class HTMLDocumentBackend(DeclarativeDocumentBackend):
24
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
25
- super().__init__(in_doc, path_or_stream)
26
- _log.debug("About to init HTML backend...")
27
- self.soup: Optional[Tag] = None
28
- # HTML file:
29
- self.path_or_stream = path_or_stream
30
- # Initialise the parents for the hierarchy
31
- self.max_levels = 10
32
- self.level = 0
33
- self.parents = {} # type: ignore
34
- for i in range(0, self.max_levels):
35
- self.parents[i] = None
36
- self.labels = {} # type: ignore
37
-
38
- try:
39
- if isinstance(self.path_or_stream, BytesIO):
40
- text_stream = self.path_or_stream.getvalue()
41
- self.soup = BeautifulSoup(text_stream, "html.parser")
42
- if isinstance(self.path_or_stream, Path):
43
- with open(self.path_or_stream, "rb") as f:
44
- html_content = f.read()
45
- self.soup = BeautifulSoup(html_content, "html.parser")
46
- except Exception as e:
47
- raise RuntimeError(
48
- f"Could not initialize HTML backend for file with hash {self.document_hash}."
49
- ) from e
50
-
51
- def is_valid(self) -> bool:
52
- return self.soup is not None
53
-
54
- @classmethod
55
- def supports_pagination(cls) -> bool:
56
- return False
57
-
58
- def unload(self):
59
- if isinstance(self.path_or_stream, BytesIO):
60
- self.path_or_stream.close()
61
-
62
- self.path_or_stream = None
63
-
64
- @classmethod
65
- def supported_formats(cls) -> Set[InputFormat]:
66
- return {InputFormat.HTML}
67
-
68
- def convert(self) -> DoclingDocument:
69
- # access self.path_or_stream to load stuff
70
- origin = DocumentOrigin(
71
- filename=self.file.name or "file",
72
- mimetype="text/html",
73
- binary_hash=self.document_hash,
74
- )
75
-
76
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
77
- _log.debug("Trying to convert HTML...")
78
-
79
- if self.is_valid():
80
- assert self.soup is not None
81
- content = self.soup.body or self.soup
82
- # Replace <br> tags with newline characters
83
- for br in content.find_all("br"):
84
- br.replace_with("\n")
85
- doc = self.walk(content, doc)
86
- else:
87
- raise RuntimeError(
88
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
89
- )
90
- return doc
91
-
92
- def walk(self, element: Tag, doc: DoclingDocument):
93
- try:
94
- # Iterate over elements in the body of the document
95
- for idx, element in enumerate(element.children):
96
- try:
97
- self.analyse_element(element, idx, doc)
98
- except Exception as exc_child:
99
-
100
- _log.error(" -> error treating child: ", exc_child)
101
- _log.error(" => element: ", element, "\n")
102
- raise exc_child
103
-
104
- except Exception as exc:
105
- pass
106
-
107
- return doc
108
-
109
- def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
110
- """
111
- if element.name!=None:
112
- _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
113
- """
114
-
115
- if element.name in self.labels:
116
- self.labels[element.name] += 1
117
- else:
118
- self.labels[element.name] = 1
119
-
120
- if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
121
- self.handle_header(element, idx, doc)
122
- elif element.name in ["p"]:
123
- self.handle_paragraph(element, idx, doc)
124
- elif element.name in ["pre"]:
125
- self.handle_code(element, idx, doc)
126
- elif element.name in ["ul", "ol"]:
127
- self.handle_list(element, idx, doc)
128
- elif element.name in ["li"]:
129
- self.handle_listitem(element, idx, doc)
130
- elif element.name == "table":
131
- self.handle_table(element, idx, doc)
132
- elif element.name == "figure":
133
- self.handle_figure(element, idx, doc)
134
- elif element.name == "img":
135
- self.handle_image(element, idx, doc)
136
- else:
137
- self.walk(element, doc)
138
-
139
- def get_direct_text(self, item: Tag):
140
- """Get the direct text of the <li> element (ignoring nested lists)."""
141
- text = item.find(string=True, recursive=False)
142
- if isinstance(text, str):
143
- return text.strip()
144
-
145
- return ""
146
-
147
- # Function to recursively extract text from all child nodes
148
- def extract_text_recursively(self, item: Tag):
149
- result = []
150
-
151
- if isinstance(item, str):
152
- return [item]
153
-
154
- if item.name not in ["ul", "ol"]:
155
- try:
156
- # Iterate over the children (and their text and tails)
157
- for child in item:
158
- try:
159
- # Recursively get the child's text content
160
- result.extend(self.extract_text_recursively(child))
161
- except:
162
- pass
163
- except:
164
- _log.warn("item has no children")
165
- pass
166
-
167
- return "".join(result) + " "
168
-
169
- def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
170
- """Handles header tags (h1, h2, etc.)."""
171
- hlevel = int(element.name.replace("h", ""))
172
- slevel = hlevel - 1
173
-
174
- label = DocItemLabel.SECTION_HEADER
175
- text = element.text.strip()
176
-
177
- if hlevel == 1:
178
- for key, val in self.parents.items():
179
- self.parents[key] = None
180
-
181
- self.level = 1
182
- self.parents[self.level] = doc.add_text(
183
- parent=self.parents[0], label=DocItemLabel.TITLE, text=text
184
- )
185
- else:
186
- if hlevel > self.level:
187
-
188
- # add invisible group
189
- for i in range(self.level + 1, hlevel):
190
- self.parents[i] = doc.add_group(
191
- name=f"header-{i}",
192
- label=GroupLabel.SECTION,
193
- parent=self.parents[i - 1],
194
- )
195
- self.level = hlevel
196
-
197
- elif hlevel < self.level:
198
-
199
- # remove the tail
200
- for key, val in self.parents.items():
201
- if key > hlevel:
202
- self.parents[key] = None
203
- self.level = hlevel
204
-
205
- self.parents[hlevel] = doc.add_heading(
206
- parent=self.parents[hlevel - 1],
207
- text=text,
208
- level=hlevel,
209
- )
210
-
211
- def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
212
- """Handles monospace code snippets (pre)."""
213
- if element.text is None:
214
- return
215
- text = element.text.strip()
216
- label = DocItemLabel.CODE
217
- if len(text) == 0:
218
- return
219
- doc.add_code(parent=self.parents[self.level], text=text)
220
-
221
- def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
222
- """Handles paragraph tags (p)."""
223
- if element.text is None:
224
- return
225
- text = element.text.strip()
226
- label = DocItemLabel.PARAGRAPH
227
- if len(text) == 0:
228
- return
229
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
230
-
231
- def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
232
- """Handles list tags (ul, ol) and their list items."""
233
-
234
- if element.name == "ul":
235
- # create a list group
236
- self.parents[self.level + 1] = doc.add_group(
237
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
238
- )
239
- elif element.name == "ol":
240
- # create a list group
241
- self.parents[self.level + 1] = doc.add_group(
242
- parent=self.parents[self.level],
243
- name="ordered list",
244
- label=GroupLabel.ORDERED_LIST,
245
- )
246
- self.level += 1
247
-
248
- self.walk(element, doc)
249
-
250
- self.parents[self.level + 1] = None
251
- self.level -= 1
252
-
253
- def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
254
- """Handles listitem tags (li)."""
255
- nested_lists = element.find(["ul", "ol"])
256
-
257
- parent_list_label = self.parents[self.level].label
258
- index_in_list = len(self.parents[self.level].children) + 1
259
-
260
- if nested_lists:
261
- name = element.name
262
- # Text in list item can be hidden within hierarchy, hence
263
- # we need to extract it recursively
264
- text = self.extract_text_recursively(element)
265
- # Flatten text, remove break lines:
266
- text = text.replace("\n", "").replace("\r", "")
267
- text = " ".join(text.split()).strip()
268
-
269
- marker = ""
270
- enumerated = False
271
- if parent_list_label == GroupLabel.ORDERED_LIST:
272
- marker = str(index_in_list)
273
- enumerated = True
274
-
275
- if len(text) > 0:
276
- # create a list-item
277
- self.parents[self.level + 1] = doc.add_list_item(
278
- text=text,
279
- enumerated=enumerated,
280
- marker=marker,
281
- parent=self.parents[self.level],
282
- )
283
- self.level += 1
284
-
285
- self.walk(element, doc)
286
-
287
- self.parents[self.level + 1] = None
288
- self.level -= 1
289
-
290
- elif isinstance(element.text, str):
291
- text = element.text.strip()
292
-
293
- marker = ""
294
- enumerated = False
295
- if parent_list_label == GroupLabel.ORDERED_LIST:
296
- marker = f"{str(index_in_list)}."
297
- enumerated = True
298
- doc.add_list_item(
299
- text=text,
300
- enumerated=enumerated,
301
- marker=marker,
302
- parent=self.parents[self.level],
303
- )
304
- else:
305
- _log.warn("list-item has no text: ", element)
306
-
307
- def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
308
- """Handles table tags."""
309
-
310
- nested_tables = element.find("table")
311
- if nested_tables is not None:
312
- _log.warn("detected nested tables: skipping for now")
313
- return
314
-
315
- # Count the number of rows (number of <tr> elements)
316
- num_rows = len(element.find_all("tr"))
317
-
318
- # Find the number of columns (taking into account colspan)
319
- num_cols = 0
320
- for row in element.find_all("tr"):
321
- col_count = 0
322
- for cell in row.find_all(["td", "th"]):
323
- colspan = int(cell.get("colspan", 1))
324
- col_count += colspan
325
- num_cols = max(num_cols, col_count)
326
-
327
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
328
-
329
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
330
-
331
- # Iterate over the rows in the table
332
- for row_idx, row in enumerate(element.find_all("tr")):
333
-
334
- # For each row, find all the column cells (both <td> and <th>)
335
- cells = row.find_all(["td", "th"])
336
-
337
- # Check if each cell in the row is a header -> means it is a column header
338
- col_header = True
339
- for j, html_cell in enumerate(cells):
340
- if html_cell.name == "td":
341
- col_header = False
342
-
343
- col_idx = 0
344
- # Extract and print the text content of each cell
345
- for _, html_cell in enumerate(cells):
346
-
347
- text = html_cell.text
348
- try:
349
- text = self.extract_table_cell_text(html_cell)
350
- except Exception as exc:
351
- _log.warn("exception: ", exc)
352
- exit(-1)
353
-
354
- # label = html_cell.name
355
-
356
- col_span = int(html_cell.get("colspan", 1))
357
- row_span = int(html_cell.get("rowspan", 1))
358
-
359
- while grid[row_idx][col_idx] is not None:
360
- col_idx += 1
361
- for r in range(row_span):
362
- for c in range(col_span):
363
- grid[row_idx + r][col_idx + c] = text
364
-
365
- cell = TableCell(
366
- text=text,
367
- row_span=row_span,
368
- col_span=col_span,
369
- start_row_offset_idx=row_idx,
370
- end_row_offset_idx=row_idx + row_span,
371
- start_col_offset_idx=col_idx,
372
- end_col_offset_idx=col_idx + col_span,
373
- col_header=col_header,
374
- row_header=((not col_header) and html_cell.name == "th"),
375
- )
376
- data.table_cells.append(cell)
377
-
378
- doc.add_table(data=data, parent=self.parents[self.level])
379
-
380
- def get_list_text(self, list_element: Tag, level=0):
381
- """Recursively extract text from <ul> or <ol> with proper indentation."""
382
- result = []
383
- bullet_char = "*" # Default bullet character for unordered lists
384
-
385
- if list_element.name == "ol": # For ordered lists, use numbers
386
- for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
387
- # Add numbering for ordered lists
388
- result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
389
- # Handle nested lists
390
- nested_list = li.find(["ul", "ol"])
391
- if nested_list:
392
- result.extend(self.get_list_text(nested_list, level + 1))
393
- elif list_element.name == "ul": # For unordered lists, use bullet points
394
- for li in list_element.find_all("li", recursive=False):
395
- # Add bullet points for unordered lists
396
- result.append(
397
- f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
398
- )
399
- # Handle nested lists
400
- nested_list = li.find(["ul", "ol"])
401
- if nested_list:
402
- result.extend(self.get_list_text(nested_list, level + 1))
403
-
404
- return result
405
-
406
- def extract_table_cell_text(self, cell: Tag):
407
- """Extract text from a table cell, including lists with indents."""
408
- contains_lists = cell.find(["ul", "ol"])
409
- if contains_lists is None:
410
- return cell.text
411
- else:
412
- _log.debug(
413
- "should extract the content correctly for table-cells with lists ..."
414
- )
415
- return cell.text
416
-
417
- def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
418
- """Handles image tags (img)."""
419
-
420
- # Extract the image URI from the <img> tag
421
- # image_uri = root.xpath('//figure//img/@src')[0]
422
-
423
- contains_captions = element.find(["figcaption"])
424
- if contains_captions is None:
425
- doc.add_picture(parent=self.parents[self.level], caption=None)
426
-
427
- else:
428
- texts = []
429
- for item in contains_captions:
430
- texts.append(item.text)
431
-
432
- fig_caption = doc.add_text(
433
- label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
434
- )
435
- doc.add_picture(
436
- parent=self.parents[self.level],
437
- caption=fig_caption,
438
- )
439
-
440
- def handle_image(self, element: Tag, idx, doc: DoclingDocument):
441
- """Handles image tags (img)."""
442
- doc.add_picture(parent=self.parents[self.level], caption=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/json/docling_json_backend.py DELETED
@@ -1,58 +0,0 @@
1
- from io import BytesIO
2
- from pathlib import Path
3
- from typing import Union
4
-
5
- from docling_core.types.doc import DoclingDocument
6
- from typing_extensions import override
7
-
8
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import InputDocument
11
-
12
-
13
- class DoclingJSONBackend(DeclarativeDocumentBackend):
14
- @override
15
- def __init__(
16
- self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
17
- ) -> None:
18
- super().__init__(in_doc, path_or_stream)
19
-
20
- # given we need to store any actual conversion exception for raising it from
21
- # convert(), this captures the successful result or the actual error in a
22
- # mutually exclusive way:
23
- self._doc_or_err = self._get_doc_or_err()
24
-
25
- @override
26
- def is_valid(self) -> bool:
27
- return isinstance(self._doc_or_err, DoclingDocument)
28
-
29
- @classmethod
30
- @override
31
- def supports_pagination(cls) -> bool:
32
- return False
33
-
34
- @classmethod
35
- @override
36
- def supported_formats(cls) -> set[InputFormat]:
37
- return {InputFormat.JSON_DOCLING}
38
-
39
- def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
40
- try:
41
- json_data: Union[str, bytes]
42
- if isinstance(self.path_or_stream, Path):
43
- with open(self.path_or_stream, encoding="utf-8") as f:
44
- json_data = f.read()
45
- elif isinstance(self.path_or_stream, BytesIO):
46
- json_data = self.path_or_stream.getvalue()
47
- else:
48
- raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
49
- return DoclingDocument.model_validate_json(json_data=json_data)
50
- except Exception as e:
51
- return e
52
-
53
- @override
54
- def convert(self) -> DoclingDocument:
55
- if isinstance(self._doc_or_err, DoclingDocument):
56
- return self._doc_or_err
57
- else:
58
- raise self._doc_or_err
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/md_backend.py DELETED
@@ -1,428 +0,0 @@
1
- import logging
2
- import re
3
- import warnings
4
- from io import BytesIO
5
- from pathlib import Path
6
- from typing import List, Optional, Set, Union
7
-
8
- import marko
9
- import marko.element
10
- import marko.ext
11
- import marko.ext.gfm
12
- import marko.inline
13
- from docling_core.types.doc import (
14
- DocItem,
15
- DocItemLabel,
16
- DoclingDocument,
17
- DocumentOrigin,
18
- GroupLabel,
19
- NodeItem,
20
- TableCell,
21
- TableData,
22
- TextItem,
23
- )
24
- from marko import Markdown
25
-
26
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
- from docling.backend.html_backend import HTMLDocumentBackend
28
- from docling.datamodel.base_models import InputFormat
29
- from docling.datamodel.document import InputDocument
30
-
31
- _log = logging.getLogger(__name__)
32
-
33
- _MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
34
- _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
- _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
-
37
-
38
- class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
- def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
- # This regex will match any sequence of underscores
41
- pattern = r"_+"
42
-
43
- def replace_match(match):
44
- underscore_sequence = match.group(
45
- 0
46
- ) # Get the full match (sequence of underscores)
47
-
48
- # Shorten the sequence if it exceeds max_length
49
- if len(underscore_sequence) > max_length:
50
- return "_" * max_length
51
- else:
52
- return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
53
-
54
- # Use re.sub to replace long underscore sequences
55
- shortened_text = re.sub(pattern, replace_match, markdown_text)
56
-
57
- if len(shortened_text) != len(markdown_text):
58
- warnings.warn("Detected potentially incorrect Markdown, correcting...")
59
-
60
- return shortened_text
61
-
62
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
63
- super().__init__(in_doc, path_or_stream)
64
-
65
- _log.debug("MD INIT!!!")
66
-
67
- # Markdown file:
68
- self.path_or_stream = path_or_stream
69
- self.valid = True
70
- self.markdown = "" # To store original Markdown string
71
-
72
- self.in_table = False
73
- self.md_table_buffer: list[str] = []
74
- self.inline_texts: list[str] = []
75
- self._html_blocks: int = 0
76
-
77
- try:
78
- if isinstance(self.path_or_stream, BytesIO):
79
- text_stream = self.path_or_stream.getvalue().decode("utf-8")
80
- # remove invalid sequences
81
- # very long sequences of underscores will lead to unnecessary long processing times.
82
- # In any proper Markdown files, underscores have to be escaped,
83
- # otherwise they represent emphasis (bold or italic)
84
- self.markdown = self._shorten_underscore_sequences(text_stream)
85
- if isinstance(self.path_or_stream, Path):
86
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
87
- md_content = f.read()
88
- # remove invalid sequences
89
- # very long sequences of underscores will lead to unnecessary long processing times.
90
- # In any proper Markdown files, underscores have to be escaped,
91
- # otherwise they represent emphasis (bold or italic)
92
- self.markdown = self._shorten_underscore_sequences(md_content)
93
- self.valid = True
94
-
95
- _log.debug(self.markdown)
96
- except Exception as e:
97
- raise RuntimeError(
98
- f"Could not initialize MD backend for file with hash {self.document_hash}."
99
- ) from e
100
- return
101
-
102
- def _close_table(self, doc: DoclingDocument):
103
- if self.in_table:
104
- _log.debug("=== TABLE START ===")
105
- for md_table_row in self.md_table_buffer:
106
- _log.debug(md_table_row)
107
- _log.debug("=== TABLE END ===")
108
- tcells: List[TableCell] = []
109
- result_table = []
110
- for n, md_table_row in enumerate(self.md_table_buffer):
111
- data = []
112
- if n == 0:
113
- header = [t.strip() for t in md_table_row.split("|")[1:-1]]
114
- for value in header:
115
- data.append(value)
116
- result_table.append(data)
117
- if n > 1:
118
- values = [t.strip() for t in md_table_row.split("|")[1:-1]]
119
- for value in values:
120
- data.append(value)
121
- result_table.append(data)
122
-
123
- for trow_ind, trow in enumerate(result_table):
124
- for tcol_ind, cellval in enumerate(trow):
125
- row_span = (
126
- 1 # currently supporting just simple tables (without spans)
127
- )
128
- col_span = (
129
- 1 # currently supporting just simple tables (without spans)
130
- )
131
- icell = TableCell(
132
- text=cellval.strip(),
133
- row_span=row_span,
134
- col_span=col_span,
135
- start_row_offset_idx=trow_ind,
136
- end_row_offset_idx=trow_ind + row_span,
137
- start_col_offset_idx=tcol_ind,
138
- end_col_offset_idx=tcol_ind + col_span,
139
- col_header=False,
140
- row_header=False,
141
- )
142
- tcells.append(icell)
143
-
144
- num_rows = len(result_table)
145
- num_cols = len(result_table[0])
146
- self.in_table = False
147
- self.md_table_buffer = [] # clean table markdown buffer
148
- # Initialize Docling TableData
149
- table_data = TableData(
150
- num_rows=num_rows, num_cols=num_cols, table_cells=tcells
151
- )
152
- # Populate
153
- for tcell in tcells:
154
- table_data.table_cells.append(tcell)
155
- if len(tcells) > 0:
156
- doc.add_table(data=table_data)
157
- return
158
-
159
- def _process_inline_text(
160
- self, parent_item: Optional[NodeItem], doc: DoclingDocument
161
- ):
162
- txt = " ".join(self.inline_texts)
163
- if len(txt) > 0:
164
- doc.add_text(
165
- label=DocItemLabel.PARAGRAPH,
166
- parent=parent_item,
167
- text=txt,
168
- )
169
- self.inline_texts = []
170
-
171
- def _iterate_elements(
172
- self,
173
- element: marko.element.Element,
174
- depth: int,
175
- doc: DoclingDocument,
176
- visited: Set[marko.element.Element],
177
- parent_item: Optional[NodeItem] = None,
178
- ):
179
-
180
- if element in visited:
181
- return
182
-
183
- # Iterates over all elements in the AST
184
- # Check for different element types and process relevant details
185
- if isinstance(element, marko.block.Heading) and len(element.children) > 0:
186
- self._close_table(doc)
187
- self._process_inline_text(parent_item, doc)
188
- _log.debug(
189
- f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
190
- )
191
- if element.level == 1:
192
- doc_label = DocItemLabel.TITLE
193
- else:
194
- doc_label = DocItemLabel.SECTION_HEADER
195
-
196
- # Header could have arbitrary inclusion of bold, italic or emphasis,
197
- # hence we need to traverse the tree to get full text of a header
198
- strings: List[str] = []
199
-
200
- # Define a recursive function to traverse the tree
201
- def traverse(node: marko.block.BlockElement):
202
- # Check if the node has a "children" attribute
203
- if hasattr(node, "children"):
204
- # If "children" is a list, continue traversal
205
- if isinstance(node.children, list):
206
- for child in node.children:
207
- traverse(child)
208
- # If "children" is text, add it to header text
209
- elif isinstance(node.children, str):
210
- strings.append(node.children)
211
-
212
- traverse(element)
213
- snippet_text = "".join(strings)
214
- if len(snippet_text) > 0:
215
- parent_item = doc.add_text(
216
- label=doc_label, parent=parent_item, text=snippet_text
217
- )
218
-
219
- elif isinstance(element, marko.block.List):
220
- has_non_empty_list_items = False
221
- for child in element.children:
222
- if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
223
- has_non_empty_list_items = True
224
- break
225
-
226
- self._close_table(doc)
227
- self._process_inline_text(parent_item, doc)
228
- _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
229
- if has_non_empty_list_items:
230
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
231
- parent_item = doc.add_group(
232
- label=label, name=f"list", parent=parent_item
233
- )
234
-
235
- elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
236
- self._close_table(doc)
237
- self._process_inline_text(parent_item, doc)
238
- _log.debug(" - List item")
239
-
240
- first_child = element.children[0]
241
- snippet_text = str(first_child.children[0].children) # type: ignore
242
- is_numbered = False
243
- if (
244
- parent_item is not None
245
- and isinstance(parent_item, DocItem)
246
- and parent_item.label == GroupLabel.ORDERED_LIST
247
- ):
248
- is_numbered = True
249
- doc.add_list_item(
250
- enumerated=is_numbered, parent=parent_item, text=snippet_text
251
- )
252
- visited.add(first_child)
253
-
254
- elif isinstance(element, marko.inline.Image):
255
- self._close_table(doc)
256
- self._process_inline_text(parent_item, doc)
257
- _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
258
-
259
- fig_caption: Optional[TextItem] = None
260
- if element.title is not None and element.title != "":
261
- fig_caption = doc.add_text(
262
- label=DocItemLabel.CAPTION, text=element.title
263
- )
264
-
265
- doc.add_picture(parent=parent_item, caption=fig_caption)
266
-
267
- elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
268
- self._process_inline_text(parent_item, doc)
269
-
270
- elif isinstance(element, marko.inline.RawText):
271
- _log.debug(f" - Paragraph (raw text): {element.children}")
272
- snippet_text = element.children.strip()
273
- # Detect start of the table:
274
- if "|" in snippet_text:
275
- # most likely part of the markdown table
276
- self.in_table = True
277
- if len(self.md_table_buffer) > 0:
278
- self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
279
- else:
280
- self.md_table_buffer.append(snippet_text)
281
- else:
282
- self._close_table(doc)
283
- # most likely just inline text
284
- self.inline_texts.append(str(element.children))
285
-
286
- elif isinstance(element, marko.inline.CodeSpan):
287
- self._close_table(doc)
288
- self._process_inline_text(parent_item, doc)
289
- _log.debug(f" - Code Span: {element.children}")
290
- snippet_text = str(element.children).strip()
291
- doc.add_code(parent=parent_item, text=snippet_text)
292
-
293
- elif (
294
- isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
295
- and len(element.children) > 0
296
- and isinstance((first_child := element.children[0]), marko.inline.RawText)
297
- and len(snippet_text := (first_child.children.strip())) > 0
298
- ):
299
- self._close_table(doc)
300
- self._process_inline_text(parent_item, doc)
301
- _log.debug(f" - Code Block: {element.children}")
302
- doc.add_code(parent=parent_item, text=snippet_text)
303
-
304
- elif isinstance(element, marko.inline.LineBreak):
305
- if self.in_table:
306
- _log.debug("Line break in a table")
307
- self.md_table_buffer.append("")
308
-
309
- elif isinstance(element, marko.block.HTMLBlock):
310
- self._html_blocks += 1
311
- self._process_inline_text(parent_item, doc)
312
- self._close_table(doc)
313
- _log.debug("HTML Block: {}".format(element))
314
- if (
315
- len(element.body) > 0
316
- ): # If Marko doesn't return any content for HTML block, skip it
317
- html_block = element.body.strip()
318
-
319
- # wrap in markers to enable post-processing in convert()
320
- text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
321
- doc.add_code(parent=parent_item, text=text_to_add)
322
- else:
323
- if not isinstance(element, str):
324
- self._close_table(doc)
325
- _log.debug("Some other element: {}".format(element))
326
-
327
- processed_block_types = (
328
- marko.block.Heading,
329
- marko.block.CodeBlock,
330
- marko.block.FencedCode,
331
- marko.inline.RawText,
332
- )
333
-
334
- # Iterate through the element's children (if any)
335
- if hasattr(element, "children") and not isinstance(
336
- element, processed_block_types
337
- ):
338
- for child in element.children:
339
- self._iterate_elements(
340
- element=child,
341
- depth=depth + 1,
342
- doc=doc,
343
- visited=visited,
344
- parent_item=parent_item,
345
- )
346
-
347
- def is_valid(self) -> bool:
348
- return self.valid
349
-
350
- def unload(self):
351
- if isinstance(self.path_or_stream, BytesIO):
352
- self.path_or_stream.close()
353
- self.path_or_stream = None
354
-
355
- @classmethod
356
- def supports_pagination(cls) -> bool:
357
- return False
358
-
359
- @classmethod
360
- def supported_formats(cls) -> Set[InputFormat]:
361
- return {InputFormat.MD}
362
-
363
- def convert(self) -> DoclingDocument:
364
- _log.debug("converting Markdown...")
365
-
366
- origin = DocumentOrigin(
367
- filename=self.file.name or "file",
368
- mimetype="text/markdown",
369
- binary_hash=self.document_hash,
370
- )
371
-
372
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
373
-
374
- if self.is_valid():
375
- # Parse the markdown into an abstract syntax tree (AST)
376
- marko_parser = Markdown()
377
- parsed_ast = marko_parser.parse(self.markdown)
378
- # Start iterating from the root of the AST
379
- self._iterate_elements(
380
- element=parsed_ast,
381
- depth=0,
382
- doc=doc,
383
- parent_item=None,
384
- visited=set(),
385
- )
386
- self._process_inline_text(None, doc) # handle last hanging inline text
387
- self._close_table(doc=doc) # handle any last hanging table
388
-
389
- # if HTML blocks were detected, export to HTML and delegate to HTML backend
390
- if self._html_blocks > 0:
391
-
392
- # export to HTML
393
- html_backend_cls = HTMLDocumentBackend
394
- html_str = doc.export_to_html()
395
-
396
- def _restore_original_html(txt, regex):
397
- _txt, count = re.subn(regex, "", txt)
398
- if count != self._html_blocks:
399
- raise RuntimeError(
400
- "An internal error has occurred during Markdown conversion."
401
- )
402
- return _txt
403
-
404
- # restore original HTML by removing previouly added markers
405
- for regex in [
406
- rf"<pre>\s*<code>\s*{_START_MARKER}",
407
- rf"{_STOP_MARKER}\s*</code>\s*</pre>",
408
- ]:
409
- html_str = _restore_original_html(txt=html_str, regex=regex)
410
- self._html_blocks = 0
411
-
412
- # delegate to HTML backend
413
- stream = BytesIO(bytes(html_str, encoding="utf-8"))
414
- in_doc = InputDocument(
415
- path_or_stream=stream,
416
- format=InputFormat.HTML,
417
- backend=html_backend_cls,
418
- filename=self.file.name,
419
- )
420
- html_backend_obj = html_backend_cls(
421
- in_doc=in_doc, path_or_stream=stream
422
- )
423
- doc = html_backend_obj.convert()
424
- else:
425
- raise RuntimeError(
426
- f"Cannot convert md with {self.document_hash} because the backend failed to init."
427
- )
428
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msexcel_backend.py DELETED
@@ -1,386 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Dict, Set, Tuple, Union
5
-
6
- from docling_core.types.doc import (
7
- DoclingDocument,
8
- DocumentOrigin,
9
- GroupLabel,
10
- ImageRef,
11
- TableCell,
12
- TableData,
13
- )
14
-
15
- # from lxml import etree
16
- from openpyxl import Workbook, load_workbook
17
- from openpyxl.cell.cell import Cell
18
- from openpyxl.drawing.image import Image
19
- from openpyxl.worksheet.worksheet import Worksheet
20
-
21
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
22
- from docling.datamodel.base_models import InputFormat
23
- from docling.datamodel.document import InputDocument
24
-
25
- _log = logging.getLogger(__name__)
26
-
27
- from typing import Any, List
28
-
29
- from PIL import Image as PILImage
30
- from pydantic import BaseModel
31
-
32
-
33
- class ExcelCell(BaseModel):
34
- row: int
35
- col: int
36
- text: str
37
- row_span: int
38
- col_span: int
39
-
40
-
41
- class ExcelTable(BaseModel):
42
- num_rows: int
43
- num_cols: int
44
- data: List[ExcelCell]
45
-
46
-
47
- class MsExcelDocumentBackend(DeclarativeDocumentBackend):
48
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
49
- super().__init__(in_doc, path_or_stream)
50
-
51
- # Initialise the parents for the hierarchy
52
- self.max_levels = 10
53
-
54
- self.parents: Dict[int, Any] = {}
55
- for i in range(-1, self.max_levels):
56
- self.parents[i] = None
57
-
58
- self.workbook = None
59
- try:
60
- if isinstance(self.path_or_stream, BytesIO):
61
- self.workbook = load_workbook(filename=self.path_or_stream)
62
-
63
- elif isinstance(self.path_or_stream, Path):
64
- self.workbook = load_workbook(filename=str(self.path_or_stream))
65
-
66
- self.valid = True
67
- except Exception as e:
68
- self.valid = False
69
-
70
- raise RuntimeError(
71
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
72
- ) from e
73
-
74
- def is_valid(self) -> bool:
75
- _log.info(f"valid: {self.valid}")
76
- return self.valid
77
-
78
- @classmethod
79
- def supports_pagination(cls) -> bool:
80
- return True
81
-
82
- def unload(self):
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.close()
85
-
86
- self.path_or_stream = None
87
-
88
- @classmethod
89
- def supported_formats(cls) -> Set[InputFormat]:
90
- return {InputFormat.XLSX}
91
-
92
- def convert(self) -> DoclingDocument:
93
- # Parses the XLSX into a structured document model.
94
-
95
- origin = DocumentOrigin(
96
- filename=self.file.name or "file.xlsx",
97
- mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
98
- binary_hash=self.document_hash,
99
- )
100
-
101
- doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
102
-
103
- if self.is_valid():
104
- doc = self._convert_workbook(doc)
105
- else:
106
- raise RuntimeError(
107
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
108
- )
109
-
110
- return doc
111
-
112
- def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
113
-
114
- if self.workbook is not None:
115
-
116
- # Iterate over all sheets
117
- for sheet_name in self.workbook.sheetnames:
118
- _log.info(f"Processing sheet: {sheet_name}")
119
-
120
- # Access the sheet by name
121
- sheet = self.workbook[sheet_name]
122
-
123
- self.parents[0] = doc.add_group(
124
- parent=None,
125
- label=GroupLabel.SECTION,
126
- name=f"sheet: {sheet_name}",
127
- )
128
-
129
- doc = self._convert_sheet(doc, sheet)
130
- else:
131
- _log.error("Workbook is not initialized.")
132
-
133
- return doc
134
-
135
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
136
-
137
- doc = self._find_tables_in_sheet(doc, sheet)
138
-
139
- doc = self._find_images_in_sheet(doc, sheet)
140
-
141
- return doc
142
-
143
- def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
144
-
145
- tables = self._find_data_tables(sheet)
146
-
147
- for excel_table in tables:
148
- num_rows = excel_table.num_rows
149
- num_cols = excel_table.num_cols
150
-
151
- table_data = TableData(
152
- num_rows=num_rows,
153
- num_cols=num_cols,
154
- table_cells=[],
155
- )
156
-
157
- for excel_cell in excel_table.data:
158
-
159
- cell = TableCell(
160
- text=excel_cell.text,
161
- row_span=excel_cell.row_span,
162
- col_span=excel_cell.col_span,
163
- start_row_offset_idx=excel_cell.row,
164
- end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
- start_col_offset_idx=excel_cell.col,
166
- end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- col_header=False,
168
- row_header=False,
169
- )
170
- table_data.table_cells.append(cell)
171
-
172
- doc.add_table(data=table_data, parent=self.parents[0])
173
-
174
- return doc
175
-
176
- def _find_data_tables(self, sheet: Worksheet):
177
- """
178
- Find all compact rectangular data tables in a sheet.
179
- """
180
- # _log.info("find_data_tables")
181
-
182
- tables = [] # List to store found tables
183
- visited: set[Tuple[int, int]] = set() # Track already visited cells
184
-
185
- # Iterate over all cells in the sheet
186
- for ri, row in enumerate(sheet.iter_rows(values_only=False)):
187
- for rj, cell in enumerate(row):
188
-
189
- # Skip empty or already visited cells
190
- if cell.value is None or (ri, rj) in visited:
191
- continue
192
-
193
- # If the cell starts a new table, find its bounds
194
- table_bounds, visited_cells = self._find_table_bounds(
195
- sheet, ri, rj, visited
196
- )
197
-
198
- visited.update(visited_cells) # Mark these cells as visited
199
- tables.append(table_bounds)
200
-
201
- return tables
202
-
203
- def _find_table_bounds(
204
- self,
205
- sheet: Worksheet,
206
- start_row: int,
207
- start_col: int,
208
- visited: set[Tuple[int, int]],
209
- ):
210
- """
211
- Determine the bounds of a compact rectangular table.
212
- Returns:
213
- - A dictionary with the bounds and data.
214
- - A set of visited cell coordinates.
215
- """
216
- _log.info("find_table_bounds")
217
-
218
- max_row = self._find_table_bottom(sheet, start_row, start_col)
219
- max_col = self._find_table_right(sheet, start_row, start_col)
220
-
221
- # Collect the data within the bounds
222
- data = []
223
- visited_cells = set()
224
- for ri in range(start_row, max_row + 1):
225
- for rj in range(start_col, max_col + 1):
226
-
227
- cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
228
-
229
- # Check if the cell belongs to a merged range
230
- row_span = 1
231
- col_span = 1
232
-
233
- # _log.info(sheet.merged_cells.ranges)
234
- for merged_range in sheet.merged_cells.ranges:
235
-
236
- if (
237
- merged_range.min_row <= ri + 1
238
- and ri + 1 <= merged_range.max_row
239
- and merged_range.min_col <= rj + 1
240
- and rj + 1 <= merged_range.max_col
241
- ):
242
-
243
- row_span = merged_range.max_row - merged_range.min_row + 1
244
- col_span = merged_range.max_col - merged_range.min_col + 1
245
- break
246
-
247
- if (ri, rj) not in visited_cells:
248
- data.append(
249
- ExcelCell(
250
- row=ri - start_row,
251
- col=rj - start_col,
252
- text=str(cell.value),
253
- row_span=row_span,
254
- col_span=col_span,
255
- )
256
- )
257
- # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
258
-
259
- # Mark all cells in the span as visited
260
- for span_row in range(ri, ri + row_span):
261
- for span_col in range(rj, rj + col_span):
262
- visited_cells.add((span_row, span_col))
263
-
264
- return (
265
- ExcelTable(
266
- num_rows=max_row + 1 - start_row,
267
- num_cols=max_col + 1 - start_col,
268
- data=data,
269
- ),
270
- visited_cells,
271
- )
272
-
273
- def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
274
- """Function to find the bottom boundary of the table"""
275
-
276
- max_row = start_row
277
-
278
- while max_row < sheet.max_row - 1:
279
- # Get the cell value or check if it is part of a merged cell
280
- cell = sheet.cell(row=max_row + 2, column=start_col + 1)
281
-
282
- # Check if the cell is part of a merged range
283
- merged_range = next(
284
- (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
285
- None,
286
- )
287
-
288
- if cell.value is None and not merged_range:
289
- break # Stop if the cell is empty and not merged
290
-
291
- # Expand max_row to include the merged range if applicable
292
- if merged_range:
293
- max_row = max(max_row, merged_range.max_row - 1)
294
- else:
295
- max_row += 1
296
-
297
- return max_row
298
-
299
- def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
300
- """Function to find the right boundary of the table"""
301
-
302
- max_col = start_col
303
-
304
- while max_col < sheet.max_column - 1:
305
- # Get the cell value or check if it is part of a merged cell
306
- cell = sheet.cell(row=start_row + 1, column=max_col + 2)
307
-
308
- # Check if the cell is part of a merged range
309
- merged_range = next(
310
- (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
311
- None,
312
- )
313
-
314
- if cell.value is None and not merged_range:
315
- break # Stop if the cell is empty and not merged
316
-
317
- # Expand max_col to include the merged range if applicable
318
- if merged_range:
319
- max_col = max(max_col, merged_range.max_col - 1)
320
- else:
321
- max_col += 1
322
-
323
- return max_col
324
-
325
- def _find_images_in_sheet(
326
- self, doc: DoclingDocument, sheet: Worksheet
327
- ) -> DoclingDocument:
328
-
329
- # Iterate over byte images in the sheet
330
- for idx, image in enumerate(sheet._images): # type: ignore
331
-
332
- try:
333
- pil_image = PILImage.open(image.ref)
334
-
335
- doc.add_picture(
336
- parent=self.parents[0],
337
- image=ImageRef.from_pil(image=pil_image, dpi=72),
338
- caption=None,
339
- )
340
- except:
341
- _log.error("could not extract the image from excel sheets")
342
-
343
- """
344
- for idx, chart in enumerate(sheet._charts): # type: ignore
345
- try:
346
- chart_path = f"chart_{idx + 1}.png"
347
- _log.info(
348
- f"Chart found, but dynamic rendering is required for: {chart_path}"
349
- )
350
-
351
- _log.info(f"Chart {idx + 1}:")
352
-
353
- # Chart type
354
- # _log.info(f"Type: {type(chart).__name__}")
355
- print(f"Type: {type(chart).__name__}")
356
-
357
- # Extract series data
358
- for series_idx, series in enumerate(chart.series):
359
- #_log.info(f"Series {series_idx + 1}:")
360
- print(f"Series {series_idx + 1} type: {type(series).__name__}")
361
- #print(f"x-values: {series.xVal}")
362
- #print(f"y-values: {series.yVal}")
363
-
364
- print(f"xval type: {type(series.xVal).__name__}")
365
-
366
- xvals = []
367
- for _ in series.xVal.numLit.pt:
368
- print(f"xval type: {type(_).__name__}")
369
- if hasattr(_, 'v'):
370
- xvals.append(_.v)
371
-
372
- print(f"x-values: {xvals}")
373
-
374
- yvals = []
375
- for _ in series.yVal:
376
- if hasattr(_, 'v'):
377
- yvals.append(_.v)
378
-
379
- print(f"y-values: {yvals}")
380
-
381
- except Exception as exc:
382
- print(exc)
383
- continue
384
- """
385
-
386
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/mspowerpoint_backend.py DELETED
@@ -1,424 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Set, Union
5
-
6
- from docling_core.types.doc import (
7
- BoundingBox,
8
- CoordOrigin,
9
- DocItemLabel,
10
- DoclingDocument,
11
- DocumentOrigin,
12
- GroupLabel,
13
- ImageRef,
14
- ProvenanceItem,
15
- Size,
16
- TableCell,
17
- TableData,
18
- )
19
- from PIL import Image, UnidentifiedImageError
20
- from pptx import Presentation
21
- from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
22
-
23
- from docling.backend.abstract_backend import (
24
- DeclarativeDocumentBackend,
25
- PaginatedDocumentBackend,
26
- )
27
- from docling.datamodel.base_models import InputFormat
28
- from docling.datamodel.document import InputDocument
29
-
30
- _log = logging.getLogger(__name__)
31
-
32
-
33
- class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
34
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
35
- super().__init__(in_doc, path_or_stream)
36
- self.namespaces = {
37
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
38
- "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
39
- "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
40
- }
41
- # Powerpoint file:
42
- self.path_or_stream = path_or_stream
43
-
44
- self.pptx_obj = None
45
- self.valid = False
46
- try:
47
- if isinstance(self.path_or_stream, BytesIO):
48
- self.pptx_obj = Presentation(self.path_or_stream)
49
- elif isinstance(self.path_or_stream, Path):
50
- self.pptx_obj = Presentation(str(self.path_or_stream))
51
-
52
- self.valid = True
53
- except Exception as e:
54
- raise RuntimeError(
55
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
56
- ) from e
57
-
58
- return
59
-
60
- def page_count(self) -> int:
61
- if self.is_valid():
62
- assert self.pptx_obj is not None
63
- return len(self.pptx_obj.slides)
64
- else:
65
- return 0
66
-
67
- def is_valid(self) -> bool:
68
- return self.valid
69
-
70
- @classmethod
71
- def supports_pagination(cls) -> bool:
72
- return True # True? if so, how to handle pages...
73
-
74
- def unload(self):
75
- if isinstance(self.path_or_stream, BytesIO):
76
- self.path_or_stream.close()
77
-
78
- self.path_or_stream = None
79
-
80
- @classmethod
81
- def supported_formats(cls) -> Set[InputFormat]:
82
- return {InputFormat.PPTX}
83
-
84
- def convert(self) -> DoclingDocument:
85
- # Parses the PPTX into a structured document model.
86
- # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
87
-
88
- origin = DocumentOrigin(
89
- filename=self.file.name or "file",
90
- mimetype="application/vnd.ms-powerpoint",
91
- binary_hash=self.document_hash,
92
- )
93
-
94
- doc = DoclingDocument(
95
- name=self.file.stem or "file", origin=origin
96
- ) # must add origin information
97
- doc = self.walk_linear(self.pptx_obj, doc)
98
-
99
- return doc
100
-
101
- def generate_prov(
102
- self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
103
- ):
104
- if shape.left:
105
- left = shape.left
106
- top = shape.top
107
- width = shape.width
108
- height = shape.height
109
- else:
110
- left = 0
111
- top = 0
112
- width = slide_size.width
113
- height = slide_size.height
114
- shape_bbox = [left, top, left + width, top + height]
115
- shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
116
- prov = ProvenanceItem(
117
- page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
118
- )
119
-
120
- return prov
121
-
122
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
123
- is_a_list = False
124
- is_list_group_created = False
125
- enum_list_item_value = 0
126
- new_list = None
127
- bullet_type = "None"
128
- list_text = ""
129
- list_label = GroupLabel.LIST
130
- doc_label = DocItemLabel.LIST_ITEM
131
- prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
132
-
133
- # Identify if shape contains lists
134
- for paragraph in shape.text_frame.paragraphs:
135
- # Check if paragraph is a bullet point using the `element` XML
136
- p = paragraph._element
137
- if (
138
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
139
- is not None
140
- ):
141
- bullet_type = "Bullet"
142
- is_a_list = True
143
- elif (
144
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
145
- is not None
146
- ):
147
- bullet_type = "Numbered"
148
- is_a_list = True
149
- else:
150
- is_a_list = False
151
-
152
- if paragraph.level > 0:
153
- # Most likely a sub-list
154
- is_a_list = True
155
-
156
- if is_a_list:
157
- # Determine if this is an unordered list or an ordered list.
158
- # Set GroupLabel.ORDERED_LIST when it fits.
159
- if bullet_type == "Numbered":
160
- list_label = GroupLabel.ORDERED_LIST
161
-
162
- if is_a_list:
163
- _log.debug("LIST DETECTED!")
164
- else:
165
- _log.debug("No List")
166
-
167
- # If there is a list inside of the shape, create a new docling list to assign list items to
168
- # if is_a_list:
169
- # new_list = doc.add_group(
170
- # label=list_label, name=f"list", parent=parent_slide
171
- # )
172
-
173
- # Iterate through paragraphs to build up text
174
- for paragraph in shape.text_frame.paragraphs:
175
- # p_text = paragraph.text.strip()
176
- p = paragraph._element
177
- enum_list_item_value += 1
178
- inline_paragraph_text = ""
179
- inline_list_item_text = ""
180
-
181
- for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
182
- if len(e.text.strip()) > 0:
183
- e_is_a_list_item = False
184
- is_numbered = False
185
- if (
186
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
187
- is not None
188
- ):
189
- bullet_type = "Bullet"
190
- e_is_a_list_item = True
191
- elif (
192
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
193
- is not None
194
- ):
195
- bullet_type = "Numbered"
196
- is_numbered = True
197
- e_is_a_list_item = True
198
- else:
199
- e_is_a_list_item = False
200
-
201
- if e_is_a_list_item:
202
- if len(inline_paragraph_text) > 0:
203
- # output accumulated inline text:
204
- doc.add_text(
205
- label=doc_label,
206
- parent=parent_slide,
207
- text=inline_paragraph_text,
208
- prov=prov,
209
- )
210
- # Set marker and enumerated arguments if this is an enumeration element.
211
- inline_list_item_text += e.text
212
- # print(e.text)
213
- else:
214
- # Assign proper label to the text, depending if it's a Title or Section Header
215
- # For other types of text, assign - PARAGRAPH
216
- doc_label = DocItemLabel.PARAGRAPH
217
- if shape.is_placeholder:
218
- placeholder_type = shape.placeholder_format.type
219
- if placeholder_type in [
220
- PP_PLACEHOLDER.CENTER_TITLE,
221
- PP_PLACEHOLDER.TITLE,
222
- ]:
223
- # It's a title
224
- doc_label = DocItemLabel.TITLE
225
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
226
- DocItemLabel.SECTION_HEADER
227
- enum_list_item_value = 0
228
- inline_paragraph_text += e.text
229
-
230
- if len(inline_paragraph_text) > 0:
231
- # output accumulated inline text:
232
- doc.add_text(
233
- label=doc_label,
234
- parent=parent_slide,
235
- text=inline_paragraph_text,
236
- prov=prov,
237
- )
238
-
239
- if len(inline_list_item_text) > 0:
240
- enum_marker = ""
241
- if is_numbered:
242
- enum_marker = str(enum_list_item_value) + "."
243
- if not is_list_group_created:
244
- new_list = doc.add_group(
245
- label=list_label, name=f"list", parent=parent_slide
246
- )
247
- is_list_group_created = True
248
- doc.add_list_item(
249
- marker=enum_marker,
250
- enumerated=is_numbered,
251
- parent=new_list,
252
- text=inline_list_item_text,
253
- prov=prov,
254
- )
255
- return
256
-
257
- def handle_title(self, shape, parent_slide, slide_ind, doc):
258
- placeholder_type = shape.placeholder_format.type
259
- txt = shape.text.strip()
260
- prov = self.generate_prov(shape, slide_ind, txt)
261
-
262
- if len(txt.strip()) > 0:
263
- # title = slide.shapes.title.text if slide.shapes.title else "No title"
264
- if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
265
- _log.info(f"Title found: {shape.text}")
266
- doc.add_text(
267
- label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
268
- )
269
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
270
- _log.info(f"Subtitle found: {shape.text}")
271
- # Using DocItemLabel.FOOTNOTE, while SUBTITLE label is not avail.
272
- doc.add_text(
273
- label=DocItemLabel.SECTION_HEADER,
274
- parent=parent_slide,
275
- text=txt,
276
- prov=prov,
277
- )
278
- return
279
-
280
- def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
281
- # Open it with PIL
282
- try:
283
- # Get the image bytes
284
- image = shape.image
285
- image_bytes = image.blob
286
- im_dpi, _ = image.dpi
287
- pil_image = Image.open(BytesIO(image_bytes))
288
-
289
- # shape has picture
290
- prov = self.generate_prov(shape, slide_ind, "", slide_size)
291
- doc.add_picture(
292
- parent=parent_slide,
293
- image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
294
- caption=None,
295
- prov=prov,
296
- )
297
- except (UnidentifiedImageError, OSError) as e:
298
- _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
299
- return
300
-
301
- def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
302
- # Handling tables, images, charts
303
- if shape.has_table:
304
- table = shape.table
305
- table_xml = shape._element
306
-
307
- prov = self.generate_prov(shape, slide_ind, "", slide_size)
308
-
309
- num_cols = 0
310
- num_rows = len(table.rows)
311
- tcells = []
312
- # Access the XML element for the shape that contains the table
313
- table_xml = shape._element
314
-
315
- for row_idx, row in enumerate(table.rows):
316
- if len(row.cells) > num_cols:
317
- num_cols = len(row.cells)
318
- for col_idx, cell in enumerate(row.cells):
319
- # Access the XML of the cell (this is the 'tc' element in table XML)
320
- cell_xml = table_xml.xpath(
321
- f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
322
- )
323
-
324
- if not cell_xml:
325
- continue # If no cell XML is found, skip
326
-
327
- cell_xml = cell_xml[0] # Get the first matching XML node
328
- row_span = cell_xml.get("rowSpan") # Vertical span
329
- col_span = cell_xml.get("gridSpan") # Horizontal span
330
-
331
- if row_span is None:
332
- row_span = 1
333
- else:
334
- row_span = int(row_span)
335
-
336
- if col_span is None:
337
- col_span = 1
338
- else:
339
- col_span = int(col_span)
340
-
341
- icell = TableCell(
342
- text=cell.text.strip(),
343
- row_span=row_span,
344
- col_span=col_span,
345
- start_row_offset_idx=row_idx,
346
- end_row_offset_idx=row_idx + row_span,
347
- start_col_offset_idx=col_idx,
348
- end_col_offset_idx=col_idx + col_span,
349
- col_header=False,
350
- row_header=False,
351
- )
352
- if len(cell.text.strip()) > 0:
353
- tcells.append(icell)
354
- # Initialize Docling TableData
355
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
356
- # Populate
357
- for tcell in tcells:
358
- data.table_cells.append(tcell)
359
- if len(tcells) > 0:
360
- # If table is not fully empty...
361
- # Create Docling table
362
- doc.add_table(parent=parent_slide, data=data, prov=prov)
363
- return
364
-
365
- def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
366
- # Units of size in PPTX by default are EMU units (English Metric Units)
367
- slide_width = pptx_obj.slide_width
368
- slide_height = pptx_obj.slide_height
369
-
370
- text_content = [] # type: ignore
371
-
372
- max_levels = 10
373
- parents = {} # type: ignore
374
- for i in range(0, max_levels):
375
- parents[i] = None
376
-
377
- # Loop through each slide
378
- for slide_num, slide in enumerate(pptx_obj.slides):
379
- slide_ind = pptx_obj.slides.index(slide)
380
- parent_slide = doc.add_group(
381
- name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
382
- )
383
-
384
- slide_size = Size(width=slide_width, height=slide_height)
385
- parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
386
-
387
- def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
388
- handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
389
- if shape.has_table:
390
- # Handle Tables
391
- self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
392
- if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
393
- # Handle Pictures
394
- self.handle_pictures(
395
- shape, parent_slide, slide_ind, doc, slide_size
396
- )
397
- # If shape doesn't have any text, move on to the next shape
398
- if not hasattr(shape, "text"):
399
- return
400
- if shape.text is None:
401
- return
402
- if len(shape.text.strip()) == 0:
403
- return
404
- if not shape.has_text_frame:
405
- _log.warning("Warning: shape has text but not text_frame")
406
- return
407
- # Handle other text elements, including lists (bullet lists, numbered lists)
408
- self.handle_text_elements(
409
- shape, parent_slide, slide_ind, doc, slide_size
410
- )
411
- return
412
-
413
- def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
414
- if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
415
- for groupedshape in shape.shapes:
416
- handle_shapes(
417
- groupedshape, parent_slide, slide_ind, doc, slide_size
418
- )
419
-
420
- # Loop through each shape in the slide
421
- for shape in slide.shapes:
422
- handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
423
-
424
- return doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/msword_backend.py DELETED
@@ -1,582 +0,0 @@
1
- import logging
2
- import re
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import Any, Optional, Union
6
-
7
- from docling_core.types.doc import (
8
- DocItemLabel,
9
- DoclingDocument,
10
- DocumentOrigin,
11
- GroupLabel,
12
- ImageRef,
13
- NodeItem,
14
- TableCell,
15
- TableData,
16
- )
17
- from docx import Document
18
- from docx.document import Document as DocxDocument
19
- from docx.oxml.table import CT_Tc
20
- from docx.oxml.xmlchemy import BaseOxmlElement
21
- from docx.table import Table, _Cell
22
- from docx.text.paragraph import Paragraph
23
- from lxml import etree
24
- from lxml.etree import XPath
25
- from PIL import Image, UnidentifiedImageError
26
- from typing_extensions import override
27
-
28
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
29
- from docling.datamodel.base_models import InputFormat
30
- from docling.datamodel.document import InputDocument
31
-
32
- _log = logging.getLogger(__name__)
33
-
34
-
35
- class MsWordDocumentBackend(DeclarativeDocumentBackend):
36
- @override
37
- def __init__(
38
- self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
39
- ) -> None:
40
- super().__init__(in_doc, path_or_stream)
41
- self.XML_KEY = (
42
- "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
43
- )
44
- self.xml_namespaces = {
45
- "w": "http://schemas.microsoft.com/office/word/2003/wordml"
46
- }
47
- # self.initialise(path_or_stream)
48
- # Word file:
49
- self.path_or_stream: Union[BytesIO, Path] = path_or_stream
50
- self.valid: bool = False
51
- # Initialise the parents for the hierarchy
52
- self.max_levels: int = 10
53
- self.level_at_new_list: Optional[int] = None
54
- self.parents: dict[int, Optional[NodeItem]] = {}
55
- for i in range(-1, self.max_levels):
56
- self.parents[i] = None
57
-
58
- self.level = 0
59
- self.listIter = 0
60
-
61
- self.history: dict[str, Any] = {
62
- "names": [None],
63
- "levels": [None],
64
- "numids": [None],
65
- "indents": [None],
66
- }
67
-
68
- self.docx_obj = None
69
- try:
70
- if isinstance(self.path_or_stream, BytesIO):
71
- self.docx_obj = Document(self.path_or_stream)
72
- elif isinstance(self.path_or_stream, Path):
73
- self.docx_obj = Document(str(self.path_or_stream))
74
-
75
- self.valid = True
76
- except Exception as e:
77
- raise RuntimeError(
78
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
79
- ) from e
80
-
81
- @override
82
- def is_valid(self) -> bool:
83
- return self.valid
84
-
85
- @classmethod
86
- @override
87
- def supports_pagination(cls) -> bool:
88
- return False
89
-
90
- @override
91
- def unload(self):
92
- if isinstance(self.path_or_stream, BytesIO):
93
- self.path_or_stream.close()
94
-
95
- self.path_or_stream = None
96
-
97
- @classmethod
98
- @override
99
- def supported_formats(cls) -> set[InputFormat]:
100
- return {InputFormat.DOCX}
101
-
102
- @override
103
- def convert(self) -> DoclingDocument:
104
- """Parses the DOCX into a structured document model.
105
-
106
- Returns:
107
- The parsed document.
108
- """
109
-
110
- origin = DocumentOrigin(
111
- filename=self.file.name or "file",
112
- mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
113
- binary_hash=self.document_hash,
114
- )
115
-
116
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
117
- if self.is_valid():
118
- assert self.docx_obj is not None
119
- doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
120
- return doc
121
- else:
122
- raise RuntimeError(
123
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
124
- )
125
-
126
- def update_history(
127
- self,
128
- name: str,
129
- level: Optional[int],
130
- numid: Optional[int],
131
- ilevel: Optional[int],
132
- ):
133
- self.history["names"].append(name)
134
- self.history["levels"].append(level)
135
-
136
- self.history["numids"].append(numid)
137
- self.history["indents"].append(ilevel)
138
-
139
- def prev_name(self) -> Optional[str]:
140
- return self.history["names"][-1]
141
-
142
- def prev_level(self) -> Optional[int]:
143
- return self.history["levels"][-1]
144
-
145
- def prev_numid(self) -> Optional[int]:
146
- return self.history["numids"][-1]
147
-
148
- def prev_indent(self) -> Optional[int]:
149
- return self.history["indents"][-1]
150
-
151
- def get_level(self) -> int:
152
- """Return the first None index."""
153
- for k, v in self.parents.items():
154
- if k >= 0 and v == None:
155
- return k
156
- return 0
157
-
158
- def walk_linear(
159
- self,
160
- body: BaseOxmlElement,
161
- docx_obj: DocxDocument,
162
- doc: DoclingDocument,
163
- ) -> DoclingDocument:
164
- for element in body:
165
- tag_name = etree.QName(element).localname
166
- # Check for Inline Images (blip elements)
167
- namespaces = {
168
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
169
- "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
170
- "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
171
- }
172
- xpath_expr = XPath(".//a:blip", namespaces=namespaces)
173
- drawing_blip = xpath_expr(element)
174
-
175
- # Check for Tables
176
- if element.tag.endswith("tbl"):
177
- try:
178
- self.handle_tables(element, docx_obj, doc)
179
- except Exception:
180
- _log.debug("could not parse a table, broken docx table")
181
-
182
- elif drawing_blip:
183
- self.handle_pictures(docx_obj, drawing_blip, doc)
184
- # Check for the sdt containers, like table of contents
185
- elif tag_name in ["sdt"]:
186
- sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
187
- if sdt_content is not None:
188
- # Iterate paragraphs, runs, or text inside <w:sdtContent>.
189
- paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
190
- for p in paragraphs:
191
- self.handle_text_elements(p, docx_obj, doc)
192
- # Check for Text
193
- elif tag_name in ["p"]:
194
- # "tcPr", "sectPr"
195
- self.handle_text_elements(element, docx_obj, doc)
196
- else:
197
- _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
198
- return doc
199
-
200
- def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
201
- if s is None:
202
- return None
203
- try:
204
- return int(s)
205
- except ValueError:
206
- return default
207
-
208
- def split_text_and_number(self, input_string: str) -> list[str]:
209
- match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
210
- if match:
211
- parts = list(filter(None, match.groups()))
212
- return parts
213
- else:
214
- return [input_string]
215
-
216
- def get_numId_and_ilvl(
217
- self, paragraph: Paragraph
218
- ) -> tuple[Optional[int], Optional[int]]:
219
- # Access the XML element of the paragraph
220
- numPr = paragraph._element.find(
221
- ".//w:numPr", namespaces=paragraph._element.nsmap
222
- )
223
-
224
- if numPr is not None:
225
- # Get the numId element and extract the value
226
- numId_elem = numPr.find("w:numId", namespaces=paragraph._element.nsmap)
227
- ilvl_elem = numPr.find("w:ilvl", namespaces=paragraph._element.nsmap)
228
- numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
229
- ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
230
-
231
- return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
232
-
233
- return None, None # If the paragraph is not part of a list
234
-
235
- def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
236
- if paragraph.style is None:
237
- return "Normal", None
238
- label = paragraph.style.style_id
239
- if label is None:
240
- return "Normal", None
241
- if ":" in label:
242
- parts = label.split(":")
243
-
244
- if len(parts) == 2:
245
- return parts[0], self.str_to_int(parts[1], None)
246
-
247
- parts = self.split_text_and_number(label)
248
-
249
- if "Heading" in label and len(parts) == 2:
250
- parts.sort()
251
- label_str: str = ""
252
- label_level: Optional[int] = 0
253
- if parts[0] == "Heading":
254
- label_str = parts[0]
255
- label_level = self.str_to_int(parts[1], None)
256
- if parts[1] == "Heading":
257
- label_str = parts[1]
258
- label_level = self.str_to_int(parts[0], None)
259
- return label_str, label_level
260
- else:
261
- return label, None
262
-
263
- def handle_text_elements(
264
- self,
265
- element: BaseOxmlElement,
266
- docx_obj: DocxDocument,
267
- doc: DoclingDocument,
268
- ) -> None:
269
- paragraph = Paragraph(element, docx_obj)
270
-
271
- if paragraph.text is None:
272
- return
273
- text = paragraph.text.strip()
274
-
275
- # Common styles for bullet and numbered lists.
276
- # "List Bullet", "List Number", "List Paragraph"
277
- # Identify wether list is a numbered list or not
278
- # is_numbered = "List Bullet" not in paragraph.style.name
279
- is_numbered = False
280
- p_style_id, p_level = self.get_label_and_level(paragraph)
281
- numid, ilevel = self.get_numId_and_ilvl(paragraph)
282
-
283
- if numid == 0:
284
- numid = None
285
-
286
- # Handle lists
287
- if (
288
- numid is not None
289
- and ilevel is not None
290
- and p_style_id not in ["Title", "Heading"]
291
- ):
292
- self.add_listitem(
293
- doc,
294
- numid,
295
- ilevel,
296
- text,
297
- is_numbered,
298
- )
299
- self.update_history(p_style_id, p_level, numid, ilevel)
300
- return
301
- elif (
302
- numid is None
303
- and self.prev_numid() is not None
304
- and p_style_id not in ["Title", "Heading"]
305
- ): # Close list
306
- if self.level_at_new_list:
307
- for key in range(len(self.parents)):
308
- if key >= self.level_at_new_list:
309
- self.parents[key] = None
310
- self.level = self.level_at_new_list - 1
311
- self.level_at_new_list = None
312
- else:
313
- for key in range(len(self.parents)):
314
- self.parents[key] = None
315
- self.level = 0
316
-
317
- if p_style_id in ["Title"]:
318
- for key in range(len(self.parents)):
319
- self.parents[key] = None
320
- self.parents[0] = doc.add_text(
321
- parent=None, label=DocItemLabel.TITLE, text=text
322
- )
323
- elif "Heading" in p_style_id:
324
- self.add_header(doc, p_level, text)
325
-
326
- elif p_style_id in [
327
- "Paragraph",
328
- "Normal",
329
- "Subtitle",
330
- "Author",
331
- "DefaultText",
332
- "ListParagraph",
333
- "ListBullet",
334
- "Quote",
335
- ]:
336
- level = self.get_level()
337
- doc.add_text(
338
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
339
- )
340
-
341
- else:
342
- # Text style names can, and will have, not only default values but user values too
343
- # hence we treat all other labels as pure text
344
- level = self.get_level()
345
- doc.add_text(
346
- label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
347
- )
348
-
349
- self.update_history(p_style_id, p_level, numid, ilevel)
350
- return
351
-
352
- def add_header(
353
- self, doc: DoclingDocument, curr_level: Optional[int], text: str
354
- ) -> None:
355
- level = self.get_level()
356
- if isinstance(curr_level, int):
357
- if curr_level > level:
358
- # add invisible group
359
- for i in range(level, curr_level):
360
- self.parents[i] = doc.add_group(
361
- parent=self.parents[i - 1],
362
- label=GroupLabel.SECTION,
363
- name=f"header-{i}",
364
- )
365
- elif curr_level < level:
366
- # remove the tail
367
- for key in range(len(self.parents)):
368
- if key >= curr_level:
369
- self.parents[key] = None
370
-
371
- self.parents[curr_level] = doc.add_heading(
372
- parent=self.parents[curr_level - 1],
373
- text=text,
374
- level=curr_level,
375
- )
376
- else:
377
- self.parents[self.level] = doc.add_heading(
378
- parent=self.parents[self.level - 1],
379
- text=text,
380
- level=1,
381
- )
382
- return
383
-
384
- def add_listitem(
385
- self,
386
- doc: DoclingDocument,
387
- numid: int,
388
- ilevel: int,
389
- text: str,
390
- is_numbered: bool = False,
391
- ) -> None:
392
- enum_marker = ""
393
-
394
- level = self.get_level()
395
- prev_indent = self.prev_indent()
396
- if self.prev_numid() is None: # Open new list
397
- self.level_at_new_list = level
398
-
399
- self.parents[level] = doc.add_group(
400
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
401
- )
402
-
403
- # Set marker and enumerated arguments if this is an enumeration element.
404
- self.listIter += 1
405
- if is_numbered:
406
- enum_marker = str(self.listIter) + "."
407
- is_numbered = True
408
- doc.add_list_item(
409
- marker=enum_marker,
410
- enumerated=is_numbered,
411
- parent=self.parents[level],
412
- text=text,
413
- )
414
-
415
- elif (
416
- self.prev_numid() == numid
417
- and self.level_at_new_list is not None
418
- and prev_indent is not None
419
- and prev_indent < ilevel
420
- ): # Open indented list
421
- for i in range(
422
- self.level_at_new_list + prev_indent + 1,
423
- self.level_at_new_list + ilevel + 1,
424
- ):
425
- # Determine if this is an unordered list or an ordered list.
426
- # Set GroupLabel.ORDERED_LIST when it fits.
427
- self.listIter = 0
428
- if is_numbered:
429
- self.parents[i] = doc.add_group(
430
- label=GroupLabel.ORDERED_LIST,
431
- name="list",
432
- parent=self.parents[i - 1],
433
- )
434
- else:
435
- self.parents[i] = doc.add_group(
436
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
437
- )
438
-
439
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
440
- self.listIter += 1
441
- if is_numbered:
442
- enum_marker = str(self.listIter) + "."
443
- is_numbered = True
444
- doc.add_list_item(
445
- marker=enum_marker,
446
- enumerated=is_numbered,
447
- parent=self.parents[self.level_at_new_list + ilevel],
448
- text=text,
449
- )
450
-
451
- elif (
452
- self.prev_numid() == numid
453
- and self.level_at_new_list is not None
454
- and prev_indent is not None
455
- and ilevel < prev_indent
456
- ): # Close list
457
- for k, v in self.parents.items():
458
- if k > self.level_at_new_list + ilevel:
459
- self.parents[k] = None
460
-
461
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
462
- self.listIter += 1
463
- if is_numbered:
464
- enum_marker = str(self.listIter) + "."
465
- is_numbered = True
466
- doc.add_list_item(
467
- marker=enum_marker,
468
- enumerated=is_numbered,
469
- parent=self.parents[self.level_at_new_list + ilevel],
470
- text=text,
471
- )
472
- self.listIter = 0
473
-
474
- elif self.prev_numid() == numid or prev_indent == ilevel:
475
- # TODO: Set marker and enumerated arguments if this is an enumeration element.
476
- self.listIter += 1
477
- if is_numbered:
478
- enum_marker = str(self.listIter) + "."
479
- is_numbered = True
480
- doc.add_list_item(
481
- marker=enum_marker,
482
- enumerated=is_numbered,
483
- parent=self.parents[level - 1],
484
- text=text,
485
- )
486
- return
487
-
488
- def handle_tables(
489
- self,
490
- element: BaseOxmlElement,
491
- docx_obj: DocxDocument,
492
- doc: DoclingDocument,
493
- ) -> None:
494
- table: Table = Table(element, docx_obj)
495
- num_rows = len(table.rows)
496
- num_cols = len(table.columns)
497
- _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
498
-
499
- if num_rows == 1 and num_cols == 1:
500
- cell_element = table.rows[0].cells[0]
501
- # In case we have a table of only 1 cell, we consider it furniture
502
- # And proceed processing the content of the cell as though it's in the document body
503
- self.walk_linear(cell_element._element, docx_obj, doc)
504
- return
505
-
506
- data = TableData(num_rows=num_rows, num_cols=num_cols)
507
- cell_set: set[CT_Tc] = set()
508
- for row_idx, row in enumerate(table.rows):
509
- _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
510
- col_idx = 0
511
- while col_idx < num_cols:
512
- cell: _Cell = row.cells[col_idx]
513
- _log.debug(
514
- f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
515
- )
516
- if cell is None or cell._tc in cell_set:
517
- _log.debug(f" skipped since repeated content")
518
- col_idx += cell.grid_span
519
- continue
520
- else:
521
- cell_set.add(cell._tc)
522
-
523
- spanned_idx = row_idx
524
- spanned_tc: Optional[CT_Tc] = cell._tc
525
- while spanned_tc == cell._tc:
526
- spanned_idx += 1
527
- spanned_tc = (
528
- table.rows[spanned_idx].cells[col_idx]._tc
529
- if spanned_idx < num_rows
530
- else None
531
- )
532
- _log.debug(f" spanned before row {spanned_idx}")
533
-
534
- table_cell = TableCell(
535
- text=cell.text,
536
- row_span=spanned_idx - row_idx,
537
- col_span=cell.grid_span,
538
- start_row_offset_idx=row.grid_cols_before + row_idx,
539
- end_row_offset_idx=row.grid_cols_before + spanned_idx,
540
- start_col_offset_idx=col_idx,
541
- end_col_offset_idx=col_idx + cell.grid_span,
542
- col_header=False,
543
- row_header=False,
544
- )
545
- data.table_cells.append(table_cell)
546
- col_idx += cell.grid_span
547
-
548
- level = self.get_level()
549
- doc.add_table(data=data, parent=self.parents[level - 1])
550
- return
551
-
552
- def handle_pictures(
553
- self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
554
- ) -> None:
555
- def get_docx_image(drawing_blip):
556
- rId = drawing_blip[0].get(
557
- "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
558
- )
559
- if rId in docx_obj.part.rels:
560
- # Access the image part using the relationship ID
561
- image_part = docx_obj.part.rels[rId].target_part
562
- image_data = image_part.blob # Get the binary image data
563
- return image_data
564
-
565
- level = self.get_level()
566
- # Open the BytesIO object with PIL to create an Image
567
- try:
568
- image_data = get_docx_image(drawing_blip)
569
- image_bytes = BytesIO(image_data)
570
- pil_image = Image.open(image_bytes)
571
- doc.add_picture(
572
- parent=self.parents[level - 1],
573
- image=ImageRef.from_pil(image=pil_image, dpi=72),
574
- caption=None,
575
- )
576
- except (UnidentifiedImageError, OSError) as e:
577
- _log.warning("Warning: image cannot be loaded by Pillow")
578
- doc.add_picture(
579
- parent=self.parents[level - 1],
580
- caption=None,
581
- )
582
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pdf_backend.py DELETED
@@ -1,76 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Iterable, Optional, Set, Union
5
-
6
- from docling_core.types.doc import BoundingBox, Size
7
- from PIL import Image
8
-
9
- from docling.backend.abstract_backend import PaginatedDocumentBackend
10
- from docling.datamodel.base_models import Cell, InputFormat
11
- from docling.datamodel.document import InputDocument
12
-
13
-
14
- class PdfPageBackend(ABC):
15
- @abstractmethod
16
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
17
- pass
18
-
19
- @abstractmethod
20
- def get_text_cells(self) -> Iterable[Cell]:
21
- pass
22
-
23
- @abstractmethod
24
- def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
25
- pass
26
-
27
- @abstractmethod
28
- def get_page_image(
29
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
30
- ) -> Image.Image:
31
- pass
32
-
33
- @abstractmethod
34
- def get_size(self) -> Size:
35
- pass
36
-
37
- @abstractmethod
38
- def is_valid(self) -> bool:
39
- pass
40
-
41
- @abstractmethod
42
- def unload(self):
43
- pass
44
-
45
-
46
- class PdfDocumentBackend(PaginatedDocumentBackend):
47
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
48
- super().__init__(in_doc, path_or_stream)
49
-
50
- if self.input_format is not InputFormat.PDF:
51
- if self.input_format is InputFormat.IMAGE:
52
- buf = BytesIO()
53
- img = Image.open(self.path_or_stream)
54
- img.save(buf, "PDF")
55
- buf.seek(0)
56
- self.path_or_stream = buf
57
- else:
58
- raise RuntimeError(
59
- f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
60
- )
61
-
62
- @abstractmethod
63
- def load_page(self, page_no: int) -> PdfPageBackend:
64
- pass
65
-
66
- @abstractmethod
67
- def page_count(self) -> int:
68
- pass
69
-
70
- @classmethod
71
- def supported_formats(cls) -> Set[InputFormat]:
72
- return {InputFormat.PDF}
73
-
74
- @classmethod
75
- def supports_pagination(cls) -> bool:
76
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/pypdfium2_backend.py DELETED
@@ -1,260 +0,0 @@
1
- import logging
2
- import random
3
- from io import BytesIO
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
-
7
- import pypdfium2 as pdfium
8
- import pypdfium2.raw as pdfium_c
9
- from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
- from PIL import Image, ImageDraw
11
- from pypdfium2 import PdfTextPage
12
- from pypdfium2._helpers.misc import PdfiumError
13
-
14
- from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
- from docling.datamodel.base_models import Cell
16
-
17
- if TYPE_CHECKING:
18
- from docling.datamodel.document import InputDocument
19
-
20
- _log = logging.getLogger(__name__)
21
-
22
-
23
- class PyPdfiumPageBackend(PdfPageBackend):
24
- def __init__(
25
- self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
26
- ):
27
- self.valid = True # No better way to tell from pypdfium.
28
- try:
29
- self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
30
- except PdfiumError as e:
31
- _log.info(
32
- f"An exception occurred when loading page {page_no} of document {document_hash}.",
33
- exc_info=True,
34
- )
35
- self.valid = False
36
- self.text_page: Optional[PdfTextPage] = None
37
-
38
- def is_valid(self) -> bool:
39
- return self.valid
40
-
41
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
- AREA_THRESHOLD = 0 # 32 * 32
43
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
- pos = obj.get_pos()
45
- cropbox = BoundingBox.from_tuple(
46
- pos, origin=CoordOrigin.BOTTOMLEFT
47
- ).to_top_left_origin(page_height=self.get_size().height)
48
-
49
- if cropbox.area() > AREA_THRESHOLD:
50
- cropbox = cropbox.scaled(scale=scale)
51
-
52
- yield cropbox
53
-
54
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
55
- if not self.text_page:
56
- self.text_page = self._ppage.get_textpage()
57
-
58
- if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
59
- bbox = bbox.to_bottom_left_origin(self.get_size().height)
60
-
61
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
62
-
63
- return text_piece
64
-
65
- def get_text_cells(self) -> Iterable[Cell]:
66
- if not self.text_page:
67
- self.text_page = self._ppage.get_textpage()
68
-
69
- cells = []
70
- cell_counter = 0
71
-
72
- page_size = self.get_size()
73
-
74
- for i in range(self.text_page.count_rects()):
75
- rect = self.text_page.get_rect(i)
76
- text_piece = self.text_page.get_text_bounded(*rect)
77
- x0, y0, x1, y1 = rect
78
- cells.append(
79
- Cell(
80
- id=cell_counter,
81
- text=text_piece,
82
- bbox=BoundingBox(
83
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
84
- ).to_top_left_origin(page_size.height),
85
- )
86
- )
87
- cell_counter += 1
88
-
89
- # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
90
- # The cell merging code below is to clean this up.
91
- def merge_horizontal_cells(
92
- cells: List[Cell],
93
- horizontal_threshold_factor: float = 1.0,
94
- vertical_threshold_factor: float = 0.5,
95
- ) -> List[Cell]:
96
- if not cells:
97
- return []
98
-
99
- def group_rows(cells: List[Cell]) -> List[List[Cell]]:
100
- rows = []
101
- current_row = [cells[0]]
102
- row_top = cells[0].bbox.t
103
- row_bottom = cells[0].bbox.b
104
- row_height = cells[0].bbox.height
105
-
106
- for cell in cells[1:]:
107
- vertical_threshold = row_height * vertical_threshold_factor
108
- if (
109
- abs(cell.bbox.t - row_top) <= vertical_threshold
110
- and abs(cell.bbox.b - row_bottom) <= vertical_threshold
111
- ):
112
- current_row.append(cell)
113
- row_top = min(row_top, cell.bbox.t)
114
- row_bottom = max(row_bottom, cell.bbox.b)
115
- row_height = row_bottom - row_top
116
- else:
117
- rows.append(current_row)
118
- current_row = [cell]
119
- row_top = cell.bbox.t
120
- row_bottom = cell.bbox.b
121
- row_height = cell.bbox.height
122
-
123
- if current_row:
124
- rows.append(current_row)
125
-
126
- return rows
127
-
128
- def merge_row(row: List[Cell]) -> List[Cell]:
129
- merged = []
130
- current_group = [row[0]]
131
-
132
- for cell in row[1:]:
133
- prev_cell = current_group[-1]
134
- avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
135
- if (
136
- cell.bbox.l - prev_cell.bbox.r
137
- <= avg_height * horizontal_threshold_factor
138
- ):
139
- current_group.append(cell)
140
- else:
141
- merged.append(merge_group(current_group))
142
- current_group = [cell]
143
-
144
- if current_group:
145
- merged.append(merge_group(current_group))
146
-
147
- return merged
148
-
149
- def merge_group(group: List[Cell]) -> Cell:
150
- if len(group) == 1:
151
- return group[0]
152
-
153
- merged_text = "".join(cell.text for cell in group)
154
- merged_bbox = BoundingBox(
155
- l=min(cell.bbox.l for cell in group),
156
- t=min(cell.bbox.t for cell in group),
157
- r=max(cell.bbox.r for cell in group),
158
- b=max(cell.bbox.b for cell in group),
159
- )
160
- return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
161
-
162
- rows = group_rows(cells)
163
- merged_cells = [cell for row in rows for cell in merge_row(row)]
164
-
165
- for i, cell in enumerate(merged_cells, 1):
166
- cell.id = i
167
-
168
- return merged_cells
169
-
170
- def draw_clusters_and_cells():
171
- image = (
172
- self.get_page_image()
173
- ) # make new image to avoid drawing on the saved ones
174
- draw = ImageDraw.Draw(image)
175
- for c in cells:
176
- x0, y0, x1, y1 = c.bbox.as_tuple()
177
- cell_color = (
178
- random.randint(30, 140),
179
- random.randint(30, 140),
180
- random.randint(30, 140),
181
- )
182
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
183
- image.show()
184
-
185
- # before merge:
186
- # draw_clusters_and_cells()
187
-
188
- cells = merge_horizontal_cells(cells)
189
-
190
- # after merge:
191
- # draw_clusters_and_cells()
192
-
193
- return cells
194
-
195
- def get_page_image(
196
- self, scale: float = 1, cropbox: Optional[BoundingBox] = None
197
- ) -> Image.Image:
198
-
199
- page_size = self.get_size()
200
-
201
- if not cropbox:
202
- cropbox = BoundingBox(
203
- l=0,
204
- r=page_size.width,
205
- t=0,
206
- b=page_size.height,
207
- coord_origin=CoordOrigin.TOPLEFT,
208
- )
209
- padbox = BoundingBox(
210
- l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
211
- )
212
- else:
213
- padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
214
- padbox.r = page_size.width - padbox.r
215
- padbox.t = page_size.height - padbox.t
216
-
217
- image = (
218
- self._ppage.render(
219
- scale=scale * 1.5,
220
- rotation=0, # no additional rotation
221
- crop=padbox.as_tuple(),
222
- )
223
- .to_pil()
224
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
225
- ) # We resize the image from 1.5x the given scale to make it sharper.
226
-
227
- return image
228
-
229
- def get_size(self) -> Size:
230
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
231
-
232
- def unload(self):
233
- self._ppage = None
234
- self.text_page = None
235
-
236
-
237
- class PyPdfiumDocumentBackend(PdfDocumentBackend):
238
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
239
- super().__init__(in_doc, path_or_stream)
240
-
241
- try:
242
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
243
- except PdfiumError as e:
244
- raise RuntimeError(
245
- f"pypdfium could not load document with hash {self.document_hash}"
246
- ) from e
247
-
248
- def page_count(self) -> int:
249
- return len(self._pdoc)
250
-
251
- def load_page(self, page_no: int) -> PyPdfiumPageBackend:
252
- return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
253
-
254
- def is_valid(self) -> bool:
255
- return self.page_count() > 0
256
-
257
- def unload(self):
258
- super().unload()
259
- self._pdoc.close()
260
- self._pdoc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/pubmed_backend.py DELETED
@@ -1,592 +0,0 @@
1
- import logging
2
- from io import BytesIO
3
- from pathlib import Path
4
- from typing import Any, Set, Union
5
-
6
- import lxml
7
- from bs4 import BeautifulSoup
8
- from docling_core.types.doc import (
9
- DocItemLabel,
10
- DoclingDocument,
11
- DocumentOrigin,
12
- GroupLabel,
13
- TableCell,
14
- TableData,
15
- )
16
- from lxml import etree
17
- from typing_extensions import TypedDict, override
18
-
19
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
- from docling.datamodel.base_models import InputFormat
21
- from docling.datamodel.document import InputDocument
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class Paragraph(TypedDict):
27
- text: str
28
- headers: list[str]
29
-
30
-
31
- class Author(TypedDict):
32
- name: str
33
- affiliation_names: list[str]
34
-
35
-
36
- class Table(TypedDict):
37
- label: str
38
- caption: str
39
- content: str
40
-
41
-
42
- class FigureCaption(TypedDict):
43
- label: str
44
- caption: str
45
-
46
-
47
- class Reference(TypedDict):
48
- author_names: str
49
- title: str
50
- journal: str
51
- year: str
52
-
53
-
54
- class XMLComponents(TypedDict):
55
- title: str
56
- authors: list[Author]
57
- abstract: str
58
- paragraphs: list[Paragraph]
59
- tables: list[Table]
60
- figure_captions: list[FigureCaption]
61
- references: list[Reference]
62
-
63
-
64
- class PubMedDocumentBackend(DeclarativeDocumentBackend):
65
- """
66
- The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
67
- Achakulvisut et al., (2020).
68
- Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
69
- Journal of Open Source Software, 5(46), 1979,
70
- https://doi.org/10.21105/joss.01979
71
- """
72
-
73
- @override
74
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
75
- super().__init__(in_doc, path_or_stream)
76
- self.path_or_stream = path_or_stream
77
-
78
- # Initialize parents for the document hierarchy
79
- self.parents: dict = {}
80
-
81
- self.valid = False
82
- try:
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.seek(0)
85
- self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
86
- if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
87
- self.valid = True
88
- except Exception as exc:
89
- raise RuntimeError(
90
- f"Could not initialize PubMed backend for file with hash {self.document_hash}."
91
- ) from exc
92
-
93
- @override
94
- def is_valid(self) -> bool:
95
- return self.valid
96
-
97
- @classmethod
98
- @override
99
- def supports_pagination(cls) -> bool:
100
- return False
101
-
102
- @override
103
- def unload(self):
104
- if isinstance(self.path_or_stream, BytesIO):
105
- self.path_or_stream.close()
106
- self.path_or_stream = None
107
-
108
- @classmethod
109
- @override
110
- def supported_formats(cls) -> Set[InputFormat]:
111
- return {InputFormat.XML_PUBMED}
112
-
113
- @override
114
- def convert(self) -> DoclingDocument:
115
- # Create empty document
116
- origin = DocumentOrigin(
117
- filename=self.file.name or "file",
118
- mimetype="application/xml",
119
- binary_hash=self.document_hash,
120
- )
121
- doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
122
-
123
- _log.debug("Trying to convert PubMed XML document...")
124
-
125
- # Get parsed XML components
126
- xml_components: XMLComponents = self._parse()
127
-
128
- # Add XML components to the document
129
- doc = self._populate_document(doc, xml_components)
130
- return doc
131
-
132
- def _parse_title(self) -> str:
133
- title: str = " ".join(
134
- [
135
- t.replace("\n", "")
136
- for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
137
- ]
138
- )
139
- return title
140
-
141
- def _parse_authors(self) -> list[Author]:
142
- # Get mapping between affiliation ids and names
143
- affiliation_names = []
144
- for affiliation_node in self.tree.xpath(".//aff[@id]"):
145
- affiliation_names.append(
146
- ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
147
- )
148
- affiliation_ids_names = {
149
- id: name
150
- for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
151
- }
152
-
153
- # Get author names and affiliation names
154
- authors: list[Author] = []
155
- for author_node in self.tree.xpath(
156
- './/contrib-group/contrib[@contrib-type="author"]'
157
- ):
158
- author: Author = {
159
- "name": "",
160
- "affiliation_names": [],
161
- }
162
-
163
- # Affiliation names
164
- affiliation_ids = [
165
- a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
166
- ]
167
- for id in affiliation_ids:
168
- if id in affiliation_ids_names:
169
- author["affiliation_names"].append(affiliation_ids_names[id])
170
-
171
- # Name
172
- author["name"] = (
173
- author_node.xpath("name/surname")[0].text
174
- + " "
175
- + author_node.xpath("name/given-names")[0].text
176
- )
177
-
178
- authors.append(author)
179
- return authors
180
-
181
- def _parse_abstract(self) -> str:
182
- texts = []
183
- for abstract_node in self.tree.xpath(".//abstract"):
184
- for text in abstract_node.itertext():
185
- texts.append(text.replace("\n", ""))
186
- abstract: str = "".join(texts)
187
- return abstract
188
-
189
- def _parse_main_text(self) -> list[Paragraph]:
190
- paragraphs: list[Paragraph] = []
191
- for paragraph_node in self.tree.xpath("//body//p"):
192
- # Skip captions
193
- if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
194
- continue
195
-
196
- paragraph: Paragraph = {"text": "", "headers": []}
197
-
198
- # Text
199
- paragraph["text"] = "".join(
200
- [t.replace("\n", "") for t in paragraph_node.itertext()]
201
- )
202
-
203
- # Header
204
- path = "../title"
205
- while len(paragraph_node.xpath(path)) > 0:
206
- paragraph["headers"].append(
207
- "".join(
208
- [
209
- t.replace("\n", "")
210
- for t in paragraph_node.xpath(path)[0].itertext()
211
- ]
212
- )
213
- )
214
- path = "../" + path
215
-
216
- paragraphs.append(paragraph)
217
-
218
- return paragraphs
219
-
220
- def _parse_tables(self) -> list[Table]:
221
- tables: list[Table] = []
222
- for table_node in self.tree.xpath(".//body//table-wrap"):
223
- table: Table = {"label": "", "caption": "", "content": ""}
224
-
225
- # Content
226
- if len(table_node.xpath("table")) > 0:
227
- table_content_node = table_node.xpath("table")[0]
228
- elif len(table_node.xpath("alternatives/table")) > 0:
229
- table_content_node = table_node.xpath("alternatives/table")[0]
230
- else:
231
- table_content_node = None
232
- if table_content_node != None:
233
- table["content"] = etree.tostring(table_content_node).decode("utf-8")
234
-
235
- # Caption
236
- if len(table_node.xpath("caption/p")) > 0:
237
- caption_node = table_node.xpath("caption/p")[0]
238
- elif len(table_node.xpath("caption/title")) > 0:
239
- caption_node = table_node.xpath("caption/title")[0]
240
- else:
241
- caption_node = None
242
- if caption_node != None:
243
- table["caption"] = "".join(
244
- [t.replace("\n", "") for t in caption_node.itertext()]
245
- )
246
-
247
- # Label
248
- if len(table_node.xpath("label")) > 0:
249
- table["label"] = table_node.xpath("label")[0].text
250
-
251
- tables.append(table)
252
- return tables
253
-
254
- def _parse_figure_captions(self) -> list[FigureCaption]:
255
- figure_captions: list[FigureCaption] = []
256
-
257
- if not (self.tree.xpath(".//fig")):
258
- return figure_captions
259
-
260
- for figure_node in self.tree.xpath(".//fig"):
261
- figure_caption: FigureCaption = {
262
- "caption": "",
263
- "label": "",
264
- }
265
-
266
- # Label
267
- if figure_node.xpath("label"):
268
- figure_caption["label"] = "".join(
269
- [
270
- t.replace("\n", "")
271
- for t in figure_node.xpath("label")[0].itertext()
272
- ]
273
- )
274
-
275
- # Caption
276
- if figure_node.xpath("caption"):
277
- caption = ""
278
- for caption_node in figure_node.xpath("caption")[0].getchildren():
279
- caption += (
280
- "".join([t.replace("\n", "") for t in caption_node.itertext()])
281
- + "\n"
282
- )
283
- figure_caption["caption"] = caption
284
-
285
- figure_captions.append(figure_caption)
286
-
287
- return figure_captions
288
-
289
- def _parse_references(self) -> list[Reference]:
290
- references: list[Reference] = []
291
- for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
292
- reference: Reference = {
293
- "author_names": "",
294
- "title": "",
295
- "journal": "",
296
- "year": "",
297
- }
298
- reference_node: Any = None
299
- for tag in ["mixed-citation", "element-citation", "citation"]:
300
- if len(reference_node_abs.xpath(tag)) > 0:
301
- reference_node = reference_node_abs.xpath(tag)[0]
302
- break
303
-
304
- if reference_node is None:
305
- continue
306
-
307
- if all(
308
- not (ref_type in ["citation-type", "publication-type"])
309
- for ref_type in reference_node.attrib.keys()
310
- ):
311
- continue
312
-
313
- # Author names
314
- names = []
315
- if len(reference_node.xpath("name")) > 0:
316
- for name_node in reference_node.xpath("name"):
317
- name_str = " ".join(
318
- [t.text for t in name_node.getchildren() if (t.text != None)]
319
- )
320
- names.append(name_str)
321
- elif len(reference_node.xpath("person-group")) > 0:
322
- for name_node in reference_node.xpath("person-group")[0]:
323
- name_str = (
324
- name_node.xpath("given-names")[0].text
325
- + " "
326
- + name_node.xpath("surname")[0].text
327
- )
328
- names.append(name_str)
329
- reference["author_names"] = "; ".join(names)
330
-
331
- # Title
332
- if len(reference_node.xpath("article-title")) > 0:
333
- reference["title"] = " ".join(
334
- [
335
- t.replace("\n", " ")
336
- for t in reference_node.xpath("article-title")[0].itertext()
337
- ]
338
- )
339
-
340
- # Journal
341
- if len(reference_node.xpath("source")) > 0:
342
- reference["journal"] = reference_node.xpath("source")[0].text
343
-
344
- # Year
345
- if len(reference_node.xpath("year")) > 0:
346
- reference["year"] = reference_node.xpath("year")[0].text
347
-
348
- if (
349
- not (reference_node.xpath("article-title"))
350
- and not (reference_node.xpath("journal"))
351
- and not (reference_node.xpath("year"))
352
- ):
353
- reference["title"] = reference_node.text
354
-
355
- references.append(reference)
356
- return references
357
-
358
- def _parse(self) -> XMLComponents:
359
- """Parsing PubMed document."""
360
- xml_components: XMLComponents = {
361
- "title": self._parse_title(),
362
- "authors": self._parse_authors(),
363
- "abstract": self._parse_abstract(),
364
- "paragraphs": self._parse_main_text(),
365
- "tables": self._parse_tables(),
366
- "figure_captions": self._parse_figure_captions(),
367
- "references": self._parse_references(),
368
- }
369
- return xml_components
370
-
371
- def _populate_document(
372
- self, doc: DoclingDocument, xml_components: XMLComponents
373
- ) -> DoclingDocument:
374
- self._add_title(doc, xml_components)
375
- self._add_authors(doc, xml_components)
376
- self._add_abstract(doc, xml_components)
377
- self._add_main_text(doc, xml_components)
378
-
379
- if xml_components["tables"]:
380
- self._add_tables(doc, xml_components)
381
-
382
- if xml_components["figure_captions"]:
383
- self._add_figure_captions(doc, xml_components)
384
-
385
- self._add_references(doc, xml_components)
386
- return doc
387
-
388
- def _add_figure_captions(
389
- self, doc: DoclingDocument, xml_components: XMLComponents
390
- ) -> None:
391
- self.parents["Figures"] = doc.add_heading(
392
- parent=self.parents["Title"], text="Figures"
393
- )
394
- for figure_caption_xml_component in xml_components["figure_captions"]:
395
- figure_caption_text = (
396
- figure_caption_xml_component["label"]
397
- + ": "
398
- + figure_caption_xml_component["caption"].strip()
399
- )
400
- fig_caption = doc.add_text(
401
- label=DocItemLabel.CAPTION, text=figure_caption_text
402
- )
403
- doc.add_picture(
404
- parent=self.parents["Figures"],
405
- caption=fig_caption,
406
- )
407
- return
408
-
409
- def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
410
- self.parents["Title"] = doc.add_text(
411
- parent=None,
412
- text=xml_components["title"],
413
- label=DocItemLabel.TITLE,
414
- )
415
- return
416
-
417
- def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
418
- authors_affiliations: list = []
419
- for author in xml_components["authors"]:
420
- authors_affiliations.append(author["name"])
421
- authors_affiliations.append(", ".join(author["affiliation_names"]))
422
- authors_affiliations_str = "; ".join(authors_affiliations)
423
-
424
- doc.add_text(
425
- parent=self.parents["Title"],
426
- text=authors_affiliations_str,
427
- label=DocItemLabel.PARAGRAPH,
428
- )
429
- return
430
-
431
- def _add_abstract(
432
- self, doc: DoclingDocument, xml_components: XMLComponents
433
- ) -> None:
434
- abstract_text: str = xml_components["abstract"]
435
- self.parents["Abstract"] = doc.add_heading(
436
- parent=self.parents["Title"], text="Abstract"
437
- )
438
- doc.add_text(
439
- parent=self.parents["Abstract"],
440
- text=abstract_text,
441
- label=DocItemLabel.TEXT,
442
- )
443
- return
444
-
445
- def _add_main_text(
446
- self, doc: DoclingDocument, xml_components: XMLComponents
447
- ) -> None:
448
- added_headers: list = []
449
- for paragraph in xml_components["paragraphs"]:
450
- if not (paragraph["headers"]):
451
- continue
452
-
453
- # Header
454
- for i, header in enumerate(reversed(paragraph["headers"])):
455
- if header in added_headers:
456
- continue
457
- added_headers.append(header)
458
-
459
- if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
460
- i - 1
461
- ] in self.parents:
462
- parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
463
- else:
464
- parent = self.parents["Title"]
465
-
466
- self.parents[header] = doc.add_heading(parent=parent, text=header)
467
-
468
- # Paragraph text
469
- if paragraph["headers"][0] in self.parents:
470
- parent = self.parents[paragraph["headers"][0]]
471
- else:
472
- parent = self.parents["Title"]
473
-
474
- doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
475
- return
476
-
477
- def _add_references(
478
- self, doc: DoclingDocument, xml_components: XMLComponents
479
- ) -> None:
480
- self.parents["References"] = doc.add_heading(
481
- parent=self.parents["Title"], text="References"
482
- )
483
- current_list = doc.add_group(
484
- parent=self.parents["References"], label=GroupLabel.LIST, name="list"
485
- )
486
- for reference in xml_components["references"]:
487
- reference_text: str = ""
488
- if reference["author_names"]:
489
- reference_text += reference["author_names"] + ". "
490
-
491
- if reference["title"]:
492
- reference_text += reference["title"]
493
- if reference["title"][-1] != ".":
494
- reference_text += "."
495
- reference_text += " "
496
-
497
- if reference["journal"]:
498
- reference_text += reference["journal"]
499
-
500
- if reference["year"]:
501
- reference_text += " (" + reference["year"] + ")"
502
-
503
- if not (reference_text):
504
- _log.debug(f"Skipping reference for: {str(self.file)}")
505
- continue
506
-
507
- doc.add_list_item(
508
- text=reference_text, enumerated=False, parent=current_list
509
- )
510
- return
511
-
512
- def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
513
- self.parents["Tables"] = doc.add_heading(
514
- parent=self.parents["Title"], text="Tables"
515
- )
516
- for table_xml_component in xml_components["tables"]:
517
- try:
518
- self._add_table(doc, table_xml_component)
519
- except Exception as e:
520
- _log.debug(f"Skipping unsupported table for: {str(self.file)}")
521
- pass
522
- return
523
-
524
- def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
525
- soup = BeautifulSoup(table_xml_component["content"], "html.parser")
526
- table_tag = soup.find("table")
527
-
528
- nested_tables = table_tag.find("table")
529
- if nested_tables:
530
- _log.debug(f"Skipping nested table for: {str(self.file)}")
531
- return
532
-
533
- # Count the number of rows (number of <tr> elements)
534
- num_rows = len(table_tag.find_all("tr"))
535
-
536
- # Find the number of columns (taking into account colspan)
537
- num_cols = 0
538
- for row in table_tag.find_all("tr"):
539
- col_count = 0
540
- for cell in row.find_all(["td", "th"]):
541
- colspan = int(cell.get("colspan", 1))
542
- col_count += colspan
543
- num_cols = max(num_cols, col_count)
544
-
545
- grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
546
-
547
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
548
-
549
- # Iterate over the rows in the table
550
- for row_idx, row in enumerate(table_tag.find_all("tr")):
551
- # For each row, find all the column cells (both <td> and <th>)
552
- cells = row.find_all(["td", "th"])
553
-
554
- # Check if each cell in the row is a header -> means it is a column header
555
- col_header = True
556
- for j, html_cell in enumerate(cells):
557
- if html_cell.name == "td":
558
- col_header = False
559
-
560
- # Extract and print the text content of each cell
561
- col_idx = 0
562
- for _, html_cell in enumerate(cells):
563
- text = html_cell.text
564
-
565
- col_span = int(html_cell.get("colspan", 1))
566
- row_span = int(html_cell.get("rowspan", 1))
567
-
568
- while grid[row_idx][col_idx] != None:
569
- col_idx += 1
570
- for r in range(row_span):
571
- for c in range(col_span):
572
- grid[row_idx + r][col_idx + c] = text
573
-
574
- cell = TableCell(
575
- text=text,
576
- row_span=row_span,
577
- col_span=col_span,
578
- start_row_offset_idx=row_idx,
579
- end_row_offset_idx=row_idx + row_span,
580
- start_col_offset_idx=col_idx,
581
- end_col_offset_idx=col_idx + col_span,
582
- col_header=col_header,
583
- row_header=((not col_header) and html_cell.name == "th"),
584
- )
585
- data.table_cells.append(cell)
586
-
587
- table_caption = doc.add_text(
588
- label=DocItemLabel.CAPTION,
589
- text=table_xml_component["label"] + ": " + table_xml_component["caption"],
590
- )
591
- doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
592
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/backend/xml/uspto_backend.py DELETED
@@ -1,1888 +0,0 @@
1
- """Backend to parse patents from the United States Patent Office (USPTO).
2
-
3
- The parsers included in this module can handle patent grants pubished since 1976 and
4
- patent applications since 2001.
5
- The original files can be found in https://bulkdata.uspto.gov.
6
- """
7
-
8
- import html
9
- import logging
10
- import re
11
- import xml.sax
12
- import xml.sax.xmlreader
13
- from abc import ABC, abstractmethod
14
- from enum import Enum, unique
15
- from io import BytesIO
16
- from pathlib import Path
17
- from typing import Any, Final, Optional, Union
18
-
19
- from bs4 import BeautifulSoup, Tag
20
- from docling_core.types.doc import (
21
- DocItem,
22
- DocItemLabel,
23
- DoclingDocument,
24
- DocumentOrigin,
25
- TableCell,
26
- TableData,
27
- TextItem,
28
- )
29
- from docling_core.types.doc.document import LevelNumber
30
- from pydantic import NonNegativeInt
31
- from typing_extensions import Self, TypedDict, override
32
-
33
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
34
- from docling.datamodel.base_models import InputFormat
35
- from docling.datamodel.document import InputDocument
36
-
37
- _log = logging.getLogger(__name__)
38
-
39
- XML_DECLARATION: Final = '<?xml version="1.0" encoding="UTF-8"?>'
40
-
41
-
42
- @unique
43
- class PatentHeading(Enum):
44
- """Text of docling headings for tagged sections in USPTO patent documents."""
45
-
46
- ABSTRACT = "ABSTRACT", 2
47
- CLAIMS = "CLAIMS", 2
48
-
49
- @override
50
- def __new__(cls, value: str, _) -> Self:
51
- obj = object.__new__(cls)
52
- obj._value_ = value
53
- return obj
54
-
55
- @override
56
- def __init__(self, _, level: LevelNumber) -> None:
57
- self.level: LevelNumber = level
58
-
59
-
60
- class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
61
- @override
62
- def __init__(
63
- self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
64
- ) -> None:
65
- super().__init__(in_doc, path_or_stream)
66
-
67
- self.patent_content: str = ""
68
- self.parser: Optional[PatentUspto] = None
69
-
70
- try:
71
- if isinstance(self.path_or_stream, BytesIO):
72
- while line := self.path_or_stream.readline().decode("utf-8"):
73
- if line.startswith("<!DOCTYPE") or line == "PATN\n":
74
- self._set_parser(line)
75
- self.patent_content += line
76
- elif isinstance(self.path_or_stream, Path):
77
- with open(self.path_or_stream, encoding="utf-8") as file_obj:
78
- while line := file_obj.readline():
79
- if line.startswith("<!DOCTYPE") or line == "PATN\n":
80
- self._set_parser(line)
81
- self.patent_content += line
82
- except Exception as exc:
83
- raise RuntimeError(
84
- f"Could not initialize USPTO backend for file with hash {self.document_hash}."
85
- ) from exc
86
-
87
- def _set_parser(self, doctype: str) -> None:
88
- doctype_line = doctype.lower()
89
- if doctype == "PATN\n":
90
- self.parser = PatentUsptoGrantAps()
91
- elif "us-patent-application-v4" in doctype_line:
92
- self.parser = PatentUsptoIce()
93
- elif "us-patent-grant-v4" in doctype_line:
94
- self.parser = PatentUsptoIce()
95
- elif "us-grant-025" in doctype_line:
96
- self.parser = PatentUsptoGrantV2()
97
- elif all(
98
- item in doctype_line
99
- for item in ("patent-application-publication", "pap-v1")
100
- ):
101
- self.parser = PatentUsptoAppV1()
102
- else:
103
- self.parser = None
104
-
105
- @override
106
- def is_valid(self) -> bool:
107
- return bool(self.patent_content) and bool(self.parser)
108
-
109
- @classmethod
110
- @override
111
- def supports_pagination(cls) -> bool:
112
- return False
113
-
114
- @override
115
- def unload(self) -> None:
116
- return
117
-
118
- @classmethod
119
- @override
120
- def supported_formats(cls) -> set[InputFormat]:
121
- return {InputFormat.XML_USPTO}
122
-
123
- @override
124
- def convert(self) -> DoclingDocument:
125
-
126
- if self.parser is not None:
127
- doc = self.parser.parse(self.patent_content)
128
- if doc is None:
129
- raise RuntimeError(
130
- f"Failed to convert doc (hash={self.document_hash}, "
131
- f"name={self.file.name})."
132
- )
133
- doc.name = self.file.name or "file"
134
- mime_type = (
135
- "text/plain"
136
- if isinstance(self.parser, PatentUsptoGrantAps)
137
- else "application/xml"
138
- )
139
- doc.origin = DocumentOrigin(
140
- mimetype=mime_type,
141
- binary_hash=self.document_hash,
142
- filename=self.file.name or "file",
143
- )
144
-
145
- return doc
146
- else:
147
- raise RuntimeError(
148
- f"Cannot convert doc (hash={self.document_hash}, "
149
- f"name={self.file.name}) because the backend failed to init."
150
- )
151
-
152
-
153
- class PatentUspto(ABC):
154
- """Parser of patent documents from the US Patent Office."""
155
-
156
- @abstractmethod
157
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
158
- """Parse a USPTO patent.
159
-
160
- Parameters:
161
- patent_content: The content of a single patent in a USPTO file.
162
-
163
- Returns:
164
- The patent parsed as a docling document.
165
- """
166
- pass
167
-
168
-
169
- class PatentUsptoIce(PatentUspto):
170
- """Parser of patent documents from the US Patent Office (ICE).
171
-
172
- The compatible formats are:
173
- - Patent Grant Full Text Data/XML Version 4.x ICE (from January 2005)
174
- - Patent Application Full Text Data/XML Version 4.x ICE (from January 2005)
175
- """
176
-
177
- def __init__(self) -> None:
178
- """Build an instance of PatentUsptoIce class."""
179
- self.handler = PatentUsptoIce.PatentHandler()
180
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
181
-
182
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
183
- try:
184
- xml.sax.parseString(patent_content, self.handler)
185
- except xml.sax._exceptions.SAXParseException as exc_sax:
186
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
187
-
188
- return None
189
-
190
- doc = self.handler.doc
191
- if doc:
192
- raw_tables = re.findall(self.pattern, patent_content)
193
- parsed_tables: list[TableData] = []
194
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
195
- for table in raw_tables:
196
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
197
- try:
198
- table_data = table_parser.parse()
199
- if table_data:
200
- parsed_tables.append(table_data)
201
- except Exception as exc_table:
202
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
203
- if len(parsed_tables) != len(doc.tables):
204
- _log.error(
205
- f"Number of referenced ({len(doc.tables)}) and parsed "
206
- f"({len(parsed_tables)}) tables differ."
207
- )
208
- else:
209
- for idx, item in enumerate(parsed_tables):
210
- doc.tables[idx].data = item
211
-
212
- return doc
213
-
214
- class PatentHandler(xml.sax.handler.ContentHandler):
215
- """SAX ContentHandler for patent documents."""
216
-
217
- APP_DOC_ELEMENT: Final = "us-patent-application"
218
- GRANT_DOC_ELEMENT: Final = "us-patent-grant"
219
-
220
- @unique
221
- class Element(Enum):
222
- """Represents an element of interest in the patent application document."""
223
-
224
- ABSTRACT = "abstract", True
225
- TITLE = "invention-title", True
226
- CLAIMS = "claims", False
227
- CLAIM = "claim", False
228
- CLAIM_TEXT = "claim-text", True
229
- PARAGRAPH = "p", True
230
- HEADING = "heading", True
231
- DESCRIPTION = "description", False
232
- TABLE = "table", False # to track its position, without text
233
- DRAWINGS = "description-of-drawings", True
234
- STYLE_SUPERSCRIPT = "sup", True
235
- STYLE_SUBSCRIPT = "sub", True
236
- MATHS = "maths", False # to avoid keeping formulas
237
-
238
- @override
239
- def __new__(cls, value: str, _) -> Self:
240
- obj = object.__new__(cls)
241
- obj._value_ = value
242
- return obj
243
-
244
- @override
245
- def __init__(self, _, is_text: bool) -> None:
246
- self.is_text: bool = is_text
247
-
248
- @override
249
- def __init__(self) -> None:
250
- """Build an instance of the patent handler."""
251
- # Current patent being parsed
252
- self.doc: Optional[DoclingDocument] = None
253
- # Keep track of docling hierarchy level
254
- self.level: LevelNumber = 1
255
- # Keep track of docling parents by level
256
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
257
- # Content to retain for the current patent
258
- self.property: list[str]
259
- self.claim: str
260
- self.claims: list[str]
261
- self.abstract: str
262
- self.text: str
263
- self._clean_data()
264
- # To handle mathematical styling
265
- self.style_html = HtmlEntity()
266
-
267
- @override
268
- def startElement(self, tag, attributes): # noqa: N802
269
- """Signal the start of an element.
270
-
271
- Args:
272
- tag: The element tag.
273
- attributes: The element attributes.
274
- """
275
- if tag in (
276
- self.APP_DOC_ELEMENT,
277
- self.GRANT_DOC_ELEMENT,
278
- ):
279
- self.doc = DoclingDocument(name="file")
280
- self.text = ""
281
- self._start_registered_elements(tag, attributes)
282
-
283
- @override
284
- def skippedEntity(self, name): # noqa: N802
285
- """Receive notification of a skipped entity.
286
-
287
- HTML entities will be skipped by the parser. This method will unescape them
288
- and add them to the text.
289
-
290
- Args:
291
- name: Entity name.
292
- """
293
- if self.property:
294
- elm_val = self.property[-1]
295
- element = self.Element(elm_val)
296
- if element.is_text:
297
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
298
- unescaped = html.unescape(escaped)
299
- if unescaped == escaped:
300
- _log.debug(f"Unrecognized HTML entity: {name}")
301
- return
302
-
303
- if element in (
304
- self.Element.STYLE_SUPERSCRIPT,
305
- self.Element.STYLE_SUBSCRIPT,
306
- ):
307
- # superscripts and subscripts need to be under text elements
308
- if len(self.property) < 2:
309
- return
310
- parent_val = self.property[-2]
311
- parent = self.Element(parent_val)
312
- if parent.is_text:
313
- self.text += self._apply_style(unescaped, elm_val)
314
- else:
315
- self.text += unescaped
316
-
317
- @override
318
- def endElement(self, tag): # noqa: N802
319
- """Signal the end of an element.
320
-
321
- Args:
322
- tag: The element tag.
323
- """
324
- if tag in (
325
- self.APP_DOC_ELEMENT,
326
- self.GRANT_DOC_ELEMENT,
327
- ):
328
- self._clean_data()
329
- self._end_registered_element(tag)
330
-
331
- @override
332
- def characters(self, content):
333
- """Receive notification of character data.
334
-
335
- Args:
336
- content: Data reported by the handler.
337
- """
338
- if self.property:
339
- elm_val = self.property[-1]
340
- element = self.Element(elm_val)
341
- if element.is_text:
342
- if element in (
343
- self.Element.STYLE_SUPERSCRIPT,
344
- self.Element.STYLE_SUBSCRIPT,
345
- ):
346
- # superscripts and subscripts need to be under text elements
347
- if len(self.property) < 2:
348
- return
349
- parent_val = self.property[-2]
350
- parent = self.Element(parent_val)
351
- if parent.is_text:
352
- self.text += self._apply_style(content, elm_val)
353
- else:
354
- self.text += content
355
-
356
- def _start_registered_elements(
357
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
358
- ) -> None:
359
- if tag in [member.value for member in self.Element]:
360
- # special case for claims: claim lines may start before the
361
- # previous one is closed
362
- if (
363
- tag == self.Element.CLAIM_TEXT.value
364
- and self.property
365
- and self.property[-1] == tag
366
- and self.text.strip()
367
- ):
368
- self.claim += " " + self.text.strip()
369
- self.text = ""
370
- elif tag == self.Element.HEADING.value:
371
- level_attr: str = attributes.get("level", "")
372
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
373
- max_level = min(self.parents.keys())
374
- # increase heading level with 1 for title, if any
375
- self.level = (
376
- new_level + 1 if (new_level + 1) in self.parents else max_level
377
- )
378
- self.property.append(tag)
379
-
380
- def _end_registered_element(self, tag: str) -> None:
381
- if tag in [item.value for item in self.Element] and self.property:
382
- current_tag = self.property.pop()
383
- self._add_property(current_tag, self.text.strip())
384
-
385
- def _add_property(self, name: str, text: str) -> None:
386
- if not name or not self.doc:
387
- return
388
-
389
- if name == self.Element.TITLE.value:
390
- if text:
391
- self.parents[self.level + 1] = self.doc.add_title(
392
- parent=self.parents[self.level],
393
- text=text,
394
- )
395
- self.level += 1
396
- self.text = ""
397
-
398
- elif name == self.Element.ABSTRACT.value:
399
- if self.abstract:
400
- heading_text = PatentHeading.ABSTRACT.value
401
- heading_level = (
402
- PatentHeading.ABSTRACT.level
403
- if PatentHeading.ABSTRACT.level in self.parents
404
- else 1
405
- )
406
- abstract_item = self.doc.add_heading(
407
- heading_text,
408
- level=heading_level,
409
- parent=self.parents[heading_level],
410
- )
411
- self.doc.add_text(
412
- label=DocItemLabel.PARAGRAPH,
413
- text=self.abstract,
414
- parent=abstract_item,
415
- )
416
-
417
- elif name == self.Element.CLAIM_TEXT.value:
418
- text = re.sub("\\s+", " ", text).strip()
419
- if text:
420
- self.claim += " " + text
421
- self.text = ""
422
-
423
- elif name == self.Element.CLAIM.value and self.claim:
424
- self.claims.append(self.claim.strip())
425
- self.claim = ""
426
-
427
- elif name == self.Element.CLAIMS.value and self.claims:
428
- heading_text = PatentHeading.CLAIMS.value
429
- heading_level = (
430
- PatentHeading.CLAIMS.level
431
- if PatentHeading.CLAIMS.level in self.parents
432
- else 1
433
- )
434
- claims_item = self.doc.add_heading(
435
- heading_text,
436
- level=heading_level,
437
- parent=self.parents[heading_level],
438
- )
439
- for text in self.claims:
440
- self.doc.add_text(
441
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
442
- )
443
-
444
- elif name == self.Element.PARAGRAPH.value and text:
445
- # remmove blank spaces added in paragraphs
446
- text = re.sub("\\s+", " ", text)
447
- if self.Element.ABSTRACT.value in self.property:
448
- self.abstract = (
449
- (self.abstract + " " + text) if self.abstract else text
450
- )
451
- else:
452
- self.doc.add_text(
453
- label=DocItemLabel.PARAGRAPH,
454
- text=text,
455
- parent=self.parents[self.level],
456
- )
457
- self.text = ""
458
-
459
- elif name == self.Element.HEADING.value and text:
460
- self.parents[self.level + 1] = self.doc.add_heading(
461
- text=text,
462
- level=self.level,
463
- parent=self.parents[self.level],
464
- )
465
- self.level += 1
466
- self.text = ""
467
-
468
- elif name == self.Element.TABLE.value:
469
- # set an empty table as placeholder
470
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
471
- self.doc.add_table(
472
- data=empty_table,
473
- parent=self.parents[self.level],
474
- )
475
-
476
- def _apply_style(self, text: str, style_tag: str) -> str:
477
- """Apply an HTML style to text.
478
-
479
- Args:
480
- text: A string containing plain text.
481
- style_tag: An HTML tag name for styling text. If the tag name is not
482
- recognized as one of the supported styles, the method will return
483
- the original `text`.
484
-
485
- Returns:
486
- A string after applying the style.
487
- """
488
- formatted = text
489
-
490
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
491
- formatted = html.unescape(self.style_html.get_superscript(text))
492
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
493
- formatted = html.unescape(self.style_html.get_subscript(text))
494
-
495
- return formatted
496
-
497
- def _clean_data(self) -> None:
498
- """Reset the variables from stream data."""
499
- self.property = []
500
- self.claim = ""
501
- self.claims = []
502
- self.abstract = ""
503
-
504
-
505
- class PatentUsptoGrantV2(PatentUspto):
506
- """Parser of patent documents from the US Patent Office (grants v2.5).
507
-
508
- The compatible format is:
509
- - Patent Grant Full Text Data/XML Version 2.5 (from January 2002 till December 2004)
510
- """
511
-
512
- @override
513
- def __init__(self) -> None:
514
- """Build an instance of PatentUsptoGrantV2 class."""
515
- self.handler = PatentUsptoGrantV2.PatentHandler()
516
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
517
-
518
- @override
519
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
520
- try:
521
- xml.sax.parseString(patent_content, self.handler)
522
- except xml.sax._exceptions.SAXParseException as exc_sax:
523
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
524
-
525
- return None
526
-
527
- doc = self.handler.doc
528
- if doc:
529
- raw_tables = re.findall(self.pattern, patent_content)
530
- parsed_tables: list[TableData] = []
531
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
532
- for table in raw_tables:
533
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
534
- try:
535
- table_data = table_parser.parse()
536
- if table_data:
537
- parsed_tables.append(table_data)
538
- except Exception as exc_table:
539
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
540
- if len(parsed_tables) != len(doc.tables):
541
- _log.error(
542
- f"Number of referenced ({len(doc.tables)}) and parsed "
543
- f"({len(parsed_tables)}) tables differ."
544
- )
545
- else:
546
- for idx, item in enumerate(parsed_tables):
547
- doc.tables[idx].data = item
548
-
549
- return doc
550
-
551
- class PatentHandler(xml.sax.handler.ContentHandler):
552
- """SAX ContentHandler for patent documents."""
553
-
554
- GRANT_DOC_ELEMENT: Final = "PATDOC"
555
- CLAIM_STATEMENT: Final = "What is claimed is:"
556
-
557
- @unique
558
- class Element(Enum):
559
- """Represents an element of interest in the patent application document."""
560
-
561
- PDAT = "PDAT", True # any type of data
562
- ABSTRACT = ("SDOAB", False)
563
- SDOCL = ("SDOCL", False)
564
- TITLE = ("B540", False)
565
- CLAIMS = ("CL", False)
566
- CLAIM = ("CLM", False)
567
- PARAGRAPH = ("PARA", True)
568
- HEADING = ("H", True)
569
- DRAWINGS = ("DRWDESC", False)
570
- STYLE_SUPERSCRIPT = ("SP", False)
571
- STYLE_SUBSCRIPT = ("SB", False)
572
- STYLE_ITALIC = ("ITALIC", False)
573
- CWU = ("CWU", False) # avoid tables, chemicals, formulas
574
- TABLE = ("table", False) # to keep track of table positions
575
-
576
- @override
577
- def __new__(cls, value: str, _) -> Self:
578
- obj = object.__new__(cls)
579
- obj._value_ = value
580
- return obj
581
-
582
- @override
583
- def __init__(self, _, is_text: bool) -> None:
584
- self.is_text: bool = is_text
585
-
586
- @override
587
- def __init__(self) -> None:
588
- """Build an instance of the patent handler."""
589
- # Current patent being parsed
590
- self.doc: Optional[DoclingDocument] = None
591
- # Keep track of docling hierarchy level
592
- self.level: LevelNumber = 1
593
- # Keep track of docling parents by level
594
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
595
- # Content to retain for the current patent
596
- self.property: list[str]
597
- self.claim: str
598
- self.claims: list[str]
599
- self.paragraph: str
600
- self.abstract: str
601
- self._clean_data()
602
- # To handle mathematical styling
603
- self.style_html = HtmlEntity()
604
-
605
- @override
606
- def startElement(self, tag, attributes): # noqa: N802
607
- """Signal the start of an element.
608
-
609
- Args:
610
- tag: The element tag.
611
- attributes: The element attributes.
612
- """
613
- if tag == self.GRANT_DOC_ELEMENT:
614
- self.doc = DoclingDocument(name="file")
615
- self.text = ""
616
- self._start_registered_elements(tag, attributes)
617
-
618
- @override
619
- def skippedEntity(self, name): # noqa: N802
620
- """Receive notification of a skipped entity.
621
-
622
- HTML entities will be skipped by the parser. This method will unescape them
623
- and add them to the text.
624
-
625
- Args:
626
- name: Entity name.
627
- """
628
- if self.property:
629
- elm_val = self.property[-1]
630
- element = self.Element(elm_val)
631
- if element.is_text:
632
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
633
- unescaped = html.unescape(escaped)
634
- if unescaped == escaped:
635
- logging.debug("Unrecognized HTML entity: " + name)
636
- return
637
-
638
- if element in (
639
- self.Element.STYLE_SUPERSCRIPT,
640
- self.Element.STYLE_SUBSCRIPT,
641
- ):
642
- # superscripts and subscripts need to be under text elements
643
- if len(self.property) < 2:
644
- return
645
- parent_val = self.property[-2]
646
- parent = self.Element(parent_val)
647
- if parent.is_text:
648
- self.text += self._apply_style(unescaped, elm_val)
649
- else:
650
- self.text += unescaped
651
-
652
- @override
653
- def endElement(self, tag): # noqa: N802
654
- """Signal the end of an element.
655
-
656
- Args:
657
- tag: The element tag.
658
- """
659
- if tag == self.GRANT_DOC_ELEMENT:
660
- self._clean_data()
661
- self._end_registered_element(tag)
662
-
663
- @override
664
- def characters(self, content):
665
- """Receive notification of character data.
666
-
667
- Args:
668
- content: Data reported by the handler.
669
- """
670
- if self.property:
671
- elm_val = self.property[-1]
672
- element = self.Element(elm_val)
673
- if element.is_text:
674
- if element in (
675
- self.Element.STYLE_SUPERSCRIPT,
676
- self.Element.STYLE_SUBSCRIPT,
677
- ):
678
- # superscripts and subscripts need to be under text elements
679
- if len(self.property) < 2:
680
- return
681
- parent_val = self.property[-2]
682
- parent = self.Element(parent_val)
683
- if parent.is_text:
684
- self.text += self._apply_style(content, elm_val)
685
- else:
686
- self.text += content
687
-
688
- def _start_registered_elements(
689
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
690
- ) -> None:
691
- if tag in [member.value for member in self.Element]:
692
- if (
693
- tag == self.Element.HEADING.value
694
- and not self.Element.SDOCL.value in self.property
695
- ):
696
- level_attr: str = attributes.get("LVL", "")
697
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
698
- max_level = min(self.parents.keys())
699
- # increase heading level with 1 for title, if any
700
- self.level = (
701
- new_level + 1 if (new_level + 1) in self.parents else max_level
702
- )
703
- self.property.append(tag)
704
-
705
- def _end_registered_element(self, tag: str) -> None:
706
- if tag in [elm.value for elm in self.Element] and self.property:
707
- current_tag = self.property.pop()
708
- self._add_property(current_tag, self.text)
709
-
710
- def _add_property(self, name: str, text: str) -> None:
711
- if not name or not self.doc:
712
- return
713
- if name == self.Element.PDAT.value and text:
714
- if not self.property:
715
- self.text = ""
716
- return
717
-
718
- wrapper = self.property[-1]
719
- text = self._apply_style(text, wrapper)
720
-
721
- if self.Element.TITLE.value in self.property and text.strip():
722
- title = text.strip()
723
- self.parents[self.level + 1] = self.doc.add_title(
724
- parent=self.parents[self.level],
725
- text=title,
726
- )
727
- self.level += 1
728
-
729
- elif self.Element.ABSTRACT.value in self.property:
730
- self.abstract += text
731
-
732
- elif self.Element.CLAIM.value in self.property:
733
- self.claim += text
734
-
735
- # Paragraph text not in claims or abstract
736
- elif (
737
- self.Element.PARAGRAPH.value in self.property
738
- and self.Element.CLAIM.value not in self.property
739
- and self.Element.ABSTRACT.value not in self.property
740
- ):
741
- self.paragraph += text
742
-
743
- # headers except claims statement
744
- elif (
745
- self.Element.HEADING.value in self.property
746
- and not self.Element.SDOCL.value in self.property
747
- and text.strip()
748
- ):
749
- self.parents[self.level + 1] = self.doc.add_heading(
750
- text=text.strip(),
751
- level=self.level,
752
- parent=self.parents[self.level],
753
- )
754
- self.level += 1
755
-
756
- self.text = ""
757
-
758
- elif name == self.Element.CLAIM.value and self.claim.strip():
759
- self.claims.append(self.claim.strip())
760
- self.claim = ""
761
-
762
- elif name == self.Element.CLAIMS.value and self.claims:
763
- heading_text = PatentHeading.CLAIMS.value
764
- heading_level = (
765
- PatentHeading.CLAIMS.level
766
- if PatentHeading.CLAIMS.level in self.parents
767
- else 1
768
- )
769
- claims_item = self.doc.add_heading(
770
- heading_text,
771
- level=heading_level,
772
- parent=self.parents[heading_level],
773
- )
774
- for text in self.claims:
775
- self.doc.add_text(
776
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
777
- )
778
-
779
- elif name == self.Element.ABSTRACT.value and self.abstract.strip():
780
- abstract = self.abstract.strip()
781
- heading_text = PatentHeading.ABSTRACT.value
782
- heading_level = (
783
- PatentHeading.ABSTRACT.level
784
- if PatentHeading.ABSTRACT.level in self.parents
785
- else 1
786
- )
787
- abstract_item = self.doc.add_heading(
788
- heading_text,
789
- level=heading_level,
790
- parent=self.parents[heading_level],
791
- )
792
- self.doc.add_text(
793
- label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
794
- )
795
-
796
- elif name == self.Element.PARAGRAPH.value:
797
- paragraph = self.paragraph.strip()
798
- if paragraph and self.Element.CLAIM.value not in self.property:
799
- self.doc.add_text(
800
- label=DocItemLabel.PARAGRAPH,
801
- text=paragraph,
802
- parent=self.parents[self.level],
803
- )
804
- elif self.Element.CLAIM.value in self.property:
805
- # we may need a space after a paragraph in claim text
806
- self.claim += " "
807
- self.paragraph = ""
808
-
809
- elif name == self.Element.TABLE.value:
810
- # set an empty table as placeholder
811
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
812
- self.doc.add_table(
813
- data=empty_table,
814
- parent=self.parents[self.level],
815
- )
816
-
817
- def _apply_style(self, text: str, style_tag: str) -> str:
818
- """Apply an HTML style to text.
819
-
820
- Args:
821
- text: A string containing plain text.
822
- style_tag: An HTML tag name for styling text. If the tag name is not
823
- recognized as one of the supported styles, the method will return
824
- the original `text`.
825
-
826
- Returns:
827
- A string after applying the style.
828
- """
829
- formatted = text
830
-
831
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
832
- formatted = html.unescape(self.style_html.get_superscript(text))
833
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
834
- formatted = html.unescape(self.style_html.get_subscript(text))
835
- elif style_tag == self.Element.STYLE_ITALIC.value:
836
- formatted = html.unescape(self.style_html.get_math_italic(text))
837
-
838
- return formatted
839
-
840
- def _clean_data(self) -> None:
841
- """Reset the variables from stream data."""
842
- self.text = ""
843
- self.property = []
844
- self.claim = ""
845
- self.claims = []
846
- self.paragraph = ""
847
- self.abstract = ""
848
-
849
-
850
- class PatentUsptoGrantAps(PatentUspto):
851
- """Parser of patents documents from the US Patent Office (grants APS).
852
-
853
- The compatible format is:
854
- - Patent Grant Full Text Data/APS (from January 1976 till December 2001)
855
- """
856
-
857
- @unique
858
- class Section(Enum):
859
- """Represent a section in a patent APS document."""
860
-
861
- ABSTRACT = "ABST"
862
- SUMMARY = "BSUM"
863
- DETAILS = "DETD"
864
- CLAIMS = "CLMS"
865
- DRAWINGS = "DRWD"
866
-
867
- @unique
868
- class Field(Enum):
869
- """Represent a field in a patent APS document."""
870
-
871
- DOC_NUMBER = "WKU"
872
- TITLE = "TTL"
873
- PARAGRAPH = "PAR"
874
- PARAGRAPH_1 = "PA1"
875
- PARAGRAPH_2 = "PA2"
876
- PARAGRAPH_3 = "PA3"
877
- TEXT = "PAL"
878
- CAPTION = "PAC"
879
- NUMBER = "NUM"
880
- NAME = "NAM"
881
- IPC = "ICL"
882
- ISSUED = "ISD"
883
- FILED = "APD"
884
- PATENT_NUMBER = "PNO"
885
- APPLICATION_NUMBER = "APN"
886
- APPLICATION_TYPE = "APT"
887
- COUNTRY = "CNT"
888
-
889
- @override
890
- def __init__(self) -> None:
891
- """Build an instance of PatentUsptoGrantAps class."""
892
- self.doc: Optional[DoclingDocument] = None
893
- # Keep track of docling hierarchy level
894
- self.level: LevelNumber = 1
895
- # Keep track of docling parents by level
896
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
897
-
898
- def get_last_text_item(self) -> Optional[TextItem]:
899
- """Get the last text item at the current document level.
900
-
901
- Returns:
902
- The text item or None, if the current level parent has no children."""
903
- if self.doc:
904
- parent = self.parents[self.level]
905
- children = parent.children if parent is not None else []
906
- else:
907
- return None
908
- text_list: list[TextItem] = [
909
- item
910
- for item in self.doc.texts
911
- if isinstance(item, TextItem) and item.get_ref() in children
912
- ]
913
-
914
- if text_list:
915
- return text_list[-1]
916
- else:
917
- return None
918
-
919
- def store_section(self, section: str) -> None:
920
- """Store the section heading in the docling document.
921
-
922
- Only the predefined sections from PatentHeading will be handled.
923
- The other sections are created by the Field.CAPTION field.
924
-
925
- Args:
926
- section: A patent section name."""
927
- heading: PatentHeading
928
- if self.doc is None:
929
- return
930
- elif section == self.Section.ABSTRACT.value:
931
- heading = PatentHeading.ABSTRACT
932
- elif section == self.Section.CLAIMS.value:
933
- heading = PatentHeading.CLAIMS
934
- else:
935
- return None
936
-
937
- self.level = heading.level if heading.level in self.parents else 1
938
- self.parents[self.level + 1] = self.doc.add_heading(
939
- heading.value,
940
- level=self.level,
941
- parent=self.parents[self.level],
942
- )
943
- self.level += 1
944
-
945
- def store_content(self, section: str, field: str, value: str) -> None:
946
- """Store the key value within a document section in the docling document.
947
-
948
- Args:
949
- section: A patent section name.
950
- field: A field name.
951
- value: A field value name.
952
- """
953
- if (
954
- not self.doc
955
- or not field
956
- or field not in [item.value for item in PatentUsptoGrantAps.Field]
957
- ):
958
- return
959
-
960
- if field == self.Field.TITLE.value:
961
- self.parents[self.level + 1] = self.doc.add_title(
962
- parent=self.parents[self.level], text=value
963
- )
964
- self.level += 1
965
-
966
- elif field == self.Field.TEXT.value and section == self.Section.ABSTRACT.value:
967
- abst_item = self.get_last_text_item()
968
- if abst_item:
969
- abst_item.text += " " + value
970
- else:
971
- self.doc.add_text(
972
- label=DocItemLabel.PARAGRAPH,
973
- text=value,
974
- parent=self.parents[self.level],
975
- )
976
-
977
- elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
978
- self.doc.add_text(
979
- label=DocItemLabel.PARAGRAPH,
980
- text="",
981
- parent=self.parents[self.level],
982
- )
983
-
984
- elif (
985
- field
986
- in (
987
- self.Field.PARAGRAPH.value,
988
- self.Field.PARAGRAPH_1.value,
989
- self.Field.PARAGRAPH_2.value,
990
- self.Field.PARAGRAPH_3.value,
991
- )
992
- and section == self.Section.CLAIMS.value
993
- ):
994
- last_claim = self.get_last_text_item()
995
- if last_claim is None:
996
- last_claim = self.doc.add_text(
997
- label=DocItemLabel.PARAGRAPH,
998
- text="",
999
- parent=self.parents[self.level],
1000
- )
1001
-
1002
- last_claim.text += f" {value}" if last_claim.text else value
1003
-
1004
- elif field == self.Field.CAPTION.value and section in (
1005
- self.Section.SUMMARY.value,
1006
- self.Section.DETAILS.value,
1007
- self.Section.DRAWINGS.value,
1008
- ):
1009
- # captions are siblings of abstract since no level info is provided
1010
- head_item = PatentHeading.ABSTRACT
1011
- self.level = head_item.level if head_item.level in self.parents else 1
1012
- self.parents[self.level + 1] = self.doc.add_heading(
1013
- value,
1014
- level=self.level,
1015
- parent=self.parents[self.level],
1016
- )
1017
- self.level += 1
1018
-
1019
- elif field in (
1020
- self.Field.PARAGRAPH.value,
1021
- self.Field.PARAGRAPH_1.value,
1022
- self.Field.PARAGRAPH_2.value,
1023
- self.Field.PARAGRAPH_3.value,
1024
- ) and section in (
1025
- self.Section.SUMMARY.value,
1026
- self.Section.DETAILS.value,
1027
- self.Section.DRAWINGS.value,
1028
- ):
1029
- self.doc.add_text(
1030
- label=DocItemLabel.PARAGRAPH,
1031
- text=value,
1032
- parent=self.parents[self.level],
1033
- )
1034
-
1035
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
1036
- self.doc = self.doc = DoclingDocument(name="file")
1037
- section: str = ""
1038
- key: str = ""
1039
- value: str = ""
1040
- line_num = 0
1041
- for line in patent_content.splitlines():
1042
- cols = re.split("\\s{2,}", line, maxsplit=1)
1043
- if key and value and (len(cols) == 1 or (len(cols) == 2 and cols[0])):
1044
- self.store_content(section, key, value)
1045
- key = ""
1046
- value = ""
1047
- if len(cols) == 1: # section title
1048
- section = cols[0]
1049
- self.store_section(section)
1050
- _log.debug(f"Parsing section {section}")
1051
- elif len(cols) == 2: # key value
1052
- if cols[0]: # key present
1053
- key = cols[0]
1054
- value = cols[1]
1055
- elif not re.match(r"^##STR\d+##$", cols[1]): # line continues
1056
- value += " " + cols[1]
1057
- line_num += 1
1058
- if key and value:
1059
- self.store_content(section, key, value)
1060
-
1061
- # TODO: parse tables
1062
- return self.doc
1063
-
1064
-
1065
- class PatentUsptoAppV1(PatentUspto):
1066
- """Parser of patent documents from the US Patent Office (applications v1.x)
1067
-
1068
- The compatible format is:
1069
- - Patent Application Full Text Data/XML Version 1.x (from March 2001 till December
1070
- 2004)
1071
- """
1072
-
1073
- @override
1074
- def __init__(self) -> None:
1075
- """Build an instance of PatentUsptoAppV1 class."""
1076
- self.handler = PatentUsptoAppV1.PatentHandler()
1077
- self.pattern = re.compile(r"^(<table .*?</table>)", re.MULTILINE | re.DOTALL)
1078
-
1079
- @override
1080
- def parse(self, patent_content: str) -> Optional[DoclingDocument]:
1081
- try:
1082
- xml.sax.parseString(patent_content, self.handler)
1083
- except xml.sax._exceptions.SAXParseException as exc_sax:
1084
- _log.error(f"Error in parsing USPTO document: {exc_sax}")
1085
-
1086
- return None
1087
-
1088
- doc = self.handler.doc
1089
- if doc:
1090
- raw_tables = re.findall(self.pattern, patent_content)
1091
- parsed_tables: list[TableData] = []
1092
- _log.debug(f"Found {len(raw_tables)} tables to be parsed with XmlTable.")
1093
- for table in raw_tables:
1094
- table_parser = XmlTable(XML_DECLARATION + "\n" + table)
1095
- try:
1096
- table_data = table_parser.parse()
1097
- if table_data:
1098
- parsed_tables.append(table_data)
1099
- except Exception as exc_table:
1100
- _log.error(f"Error in parsing USPTO tables: {exc_table}")
1101
- if len(parsed_tables) != len(doc.tables):
1102
- _log.error(
1103
- f"Number of referenced ({len(doc.tables)}) and parsed "
1104
- f"({len(parsed_tables)}) tables differ."
1105
- )
1106
- else:
1107
- for idx, item in enumerate(parsed_tables):
1108
- doc.tables[idx].data = item
1109
-
1110
- return doc
1111
-
1112
- class PatentHandler(xml.sax.handler.ContentHandler):
1113
- """SAX ContentHandler for patent documents."""
1114
-
1115
- APP_DOC_ELEMENT: Final = "patent-application-publication"
1116
-
1117
- @unique
1118
- class Element(Enum):
1119
- """Represents an element of interest in the patent application document."""
1120
-
1121
- DRAWINGS = "brief-description-of-drawings", False
1122
- ABSTRACT = "subdoc-abstract", False
1123
- TITLE = "title-of-invention", True
1124
- CLAIMS = "subdoc-claims", False
1125
- CLAIM = "claim", False
1126
- CLAIM_TEXT = "claim-text", True
1127
- NUMBER = ("number", False)
1128
- PARAGRAPH = "paragraph", True
1129
- HEADING = "heading", True
1130
- STYLE_SUPERSCRIPT = "superscript", True
1131
- STYLE_SUBSCRIPT = "subscript", True
1132
- # do not store text of a table, since it can be within paragraph
1133
- TABLE = "table", False
1134
- # do not store text of a formula, since it can be within paragraph
1135
- MATH = "math-cwu", False
1136
-
1137
- @override
1138
- def __new__(cls, value: str, _) -> Self:
1139
- obj = object.__new__(cls)
1140
- obj._value_ = value
1141
- return obj
1142
-
1143
- @override
1144
- def __init__(self, _, is_text: bool) -> None:
1145
- self.is_text: bool = is_text
1146
-
1147
- @override
1148
- def __init__(self) -> None:
1149
- """Build an instance of the patent handler."""
1150
- # Current patent being parsed
1151
- self.doc: Optional[DoclingDocument] = None
1152
- # Keep track of docling hierarchy level
1153
- self.level: LevelNumber = 1
1154
- # Keep track of docling parents by level
1155
- self.parents: dict[LevelNumber, Optional[DocItem]] = {1: None}
1156
- # Content to retain for the current patent
1157
- self.property: list[str]
1158
- self.claim: str
1159
- self.claims: list[str]
1160
- self.abstract: str
1161
- self.text: str
1162
- self._clean_data()
1163
- # To handle mathematical styling
1164
- self.style_html = HtmlEntity()
1165
-
1166
- @override
1167
- def startElement(self, tag, attributes): # noqa: N802
1168
- """Signal the start of an element.
1169
-
1170
- Args:
1171
- tag: The element tag.
1172
- attributes: The element attributes.
1173
- """
1174
- if tag == self.APP_DOC_ELEMENT:
1175
- self.doc = DoclingDocument(name="file")
1176
- self.text = ""
1177
- self._start_registered_elements(tag, attributes)
1178
-
1179
- @override
1180
- def skippedEntity(self, name): # noqa: N802
1181
- """Receive notification of a skipped entity.
1182
-
1183
- HTML entities will be skipped by the parser. This method will unescape them
1184
- and add them to the text.
1185
-
1186
- Args:
1187
- name: Entity name.
1188
- """
1189
- if self.property:
1190
- elm_val = self.property[-1]
1191
- element = self.Element(elm_val)
1192
- if element.is_text:
1193
- escaped = self.style_html.get_greek_from_iso8879(f"&{name};")
1194
- unescaped = html.unescape(escaped)
1195
- if unescaped == escaped:
1196
- logging.debug("Unrecognized HTML entity: " + name)
1197
- return
1198
-
1199
- if element in (
1200
- self.Element.STYLE_SUPERSCRIPT,
1201
- self.Element.STYLE_SUBSCRIPT,
1202
- ):
1203
- # superscripts and subscripts need to be under text elements
1204
- if len(self.property) < 2:
1205
- return
1206
- parent_val = self.property[-2]
1207
- parent = self.Element(parent_val)
1208
- if parent.is_text:
1209
- self.text += self._apply_style(unescaped, elm_val)
1210
- else:
1211
- self.text += unescaped
1212
-
1213
- @override
1214
- def endElement(self, tag): # noqa: N802
1215
- """Signal the end of an element.
1216
-
1217
- Args:
1218
- tag: The element tag.
1219
- """
1220
- if tag == self.APP_DOC_ELEMENT:
1221
- self._clean_data()
1222
- self._end_registered_element(tag)
1223
-
1224
- @override
1225
- def characters(self, content):
1226
- """Receive notification of character data.
1227
-
1228
- Args:
1229
- content: Data reported by the handler.
1230
- """
1231
- if self.property:
1232
- elm_val = self.property[-1]
1233
- element = self.Element(elm_val)
1234
- if element.is_text:
1235
- if element in (
1236
- self.Element.STYLE_SUPERSCRIPT,
1237
- self.Element.STYLE_SUBSCRIPT,
1238
- ):
1239
- # superscripts and subscripts need to be under text elements
1240
- if len(self.property) < 2:
1241
- return
1242
- parent_val = self.property[-2]
1243
- parent = self.Element(parent_val)
1244
- if parent.is_text:
1245
- self.text += self._apply_style(content, elm_val)
1246
- else:
1247
- self.text += content
1248
-
1249
- def _start_registered_elements(
1250
- self, tag: str, attributes: xml.sax.xmlreader.AttributesImpl
1251
- ) -> None:
1252
- if tag in [member.value for member in self.Element]:
1253
- # special case for claims: claim lines may start before the
1254
- # previous one is closed
1255
- if (
1256
- tag == self.Element.CLAIM_TEXT.value
1257
- and self.property
1258
- and self.property[-1] == tag
1259
- and self.text.strip()
1260
- ):
1261
- self.claim += " " + self.text.strip("\n")
1262
- self.text = ""
1263
- elif tag == self.Element.HEADING.value:
1264
- level_attr: str = attributes.get("lvl", "")
1265
- new_level: int = int(level_attr) if level_attr.isnumeric() else 1
1266
- max_level = min(self.parents.keys())
1267
- # increase heading level with 1 for title, if any
1268
- self.level = (
1269
- new_level + 1 if (new_level + 1) in self.parents else max_level
1270
- )
1271
- self.property.append(tag)
1272
-
1273
- def _end_registered_element(self, tag: str) -> None:
1274
- if tag in [elm.value for elm in self.Element] and self.property:
1275
- current_tag = self.property.pop()
1276
- self._add_property(current_tag, self.text)
1277
-
1278
- def _add_property(self, name: str, text: str) -> None:
1279
- if not name or not self.doc:
1280
- return
1281
-
1282
- if name == self.Element.TITLE.value:
1283
- title = text.strip()
1284
- if title:
1285
- self.parents[self.level + 1] = self.doc.add_text(
1286
- parent=self.parents[self.level],
1287
- label=DocItemLabel.TITLE,
1288
- text=title,
1289
- )
1290
- self.level += 1
1291
- self.text = ""
1292
- elif name == self.Element.ABSTRACT.value:
1293
- abstract = self.abstract.strip()
1294
- if abstract:
1295
- heading_text = PatentHeading.ABSTRACT.value
1296
- heading_level = (
1297
- PatentHeading.ABSTRACT.level
1298
- if PatentHeading.ABSTRACT.level in self.parents
1299
- else 1
1300
- )
1301
- abstract_item = self.doc.add_heading(
1302
- heading_text,
1303
- level=heading_level,
1304
- parent=self.parents[heading_level],
1305
- )
1306
- self.doc.add_text(
1307
- label=DocItemLabel.PARAGRAPH,
1308
- text=self.abstract,
1309
- parent=abstract_item,
1310
- )
1311
- self.abstract = ""
1312
- self.text = ""
1313
- elif name == self.Element.CLAIM_TEXT.value:
1314
- if text:
1315
- self.claim += self.text.strip("\n")
1316
- self.text = ""
1317
-
1318
- elif name == self.Element.CLAIM.value:
1319
- claim = self.claim.strip()
1320
- if claim:
1321
- self.claims.append(claim)
1322
- self.claim = ""
1323
-
1324
- elif name == self.Element.CLAIMS.value and self.claims:
1325
- heading_text = PatentHeading.CLAIMS.value
1326
- heading_level = (
1327
- PatentHeading.CLAIMS.level
1328
- if PatentHeading.CLAIMS.level in self.parents
1329
- else 1
1330
- )
1331
- claims_item = self.doc.add_heading(
1332
- heading_text,
1333
- level=heading_level,
1334
- parent=self.parents[heading_level],
1335
- )
1336
- for text in self.claims:
1337
- self.doc.add_text(
1338
- label=DocItemLabel.PARAGRAPH, text=text, parent=claims_item
1339
- )
1340
-
1341
- elif name in (
1342
- self.Element.PARAGRAPH.value,
1343
- self.Element.HEADING.value,
1344
- ):
1345
- if text and self.Element.ABSTRACT.value in self.property:
1346
- self.abstract = (self.abstract + text) if self.abstract else text
1347
- elif text.strip():
1348
- text = re.sub("\\s+", " ", text).strip()
1349
- if name == self.Element.HEADING.value:
1350
- self.parents[self.level + 1] = self.doc.add_heading(
1351
- text=text,
1352
- level=self.level,
1353
- parent=self.parents[self.level],
1354
- )
1355
- self.level += 1
1356
- else:
1357
- self.doc.add_text(
1358
- label=DocItemLabel.PARAGRAPH,
1359
- text=text,
1360
- parent=self.parents[self.level],
1361
- )
1362
- self.text = ""
1363
-
1364
- elif name == self.Element.TABLE.value:
1365
- # set an empty table as placeholder
1366
- empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1367
- self.doc.add_table(
1368
- data=empty_table,
1369
- parent=self.parents[self.level],
1370
- )
1371
-
1372
- def _apply_style(self, text: str, style_tag: str) -> str:
1373
- """Apply an HTML style to text.
1374
-
1375
- Args:
1376
- text: A string containing plain text.
1377
- style_tag: An HTML tag name for styling text. If the tag name is not
1378
- recognized as one of the supported styles, the method will return
1379
- the original `text`.
1380
-
1381
- Returns:
1382
- A string after applying the style.
1383
- """
1384
- formatted = html.unescape(text)
1385
-
1386
- if style_tag == self.Element.STYLE_SUPERSCRIPT.value:
1387
- formatted = html.unescape(self.style_html.get_superscript(formatted))
1388
- elif style_tag == self.Element.STYLE_SUBSCRIPT.value:
1389
- formatted = html.unescape(self.style_html.get_subscript(formatted))
1390
-
1391
- return formatted
1392
-
1393
- def _clean_data(self):
1394
- """Reset the variables from stream data."""
1395
- self.property = []
1396
- self.abstract = ""
1397
- self.claim = ""
1398
- self.claims = []
1399
- self.text = ""
1400
-
1401
-
1402
- class XmlTable:
1403
- """Provide a table parser for xml tables in USPTO patent documents.
1404
-
1405
- The OASIS Open XML Exchange Table Model can be downloaded from:
1406
- http://oasis-open.org/specs/soextblx.dtd
1407
- """
1408
-
1409
- class MinColInfoType(TypedDict):
1410
- offset: list[int]
1411
- colwidth: list[int]
1412
-
1413
- class ColInfoType(MinColInfoType):
1414
- cell_range: list[int]
1415
- cell_offst: list[int]
1416
-
1417
- def __init__(self, input: str) -> None:
1418
- """Initialize the table parser with the xml content.
1419
-
1420
- Args:
1421
- input: The xml content.
1422
- """
1423
- self.max_nbr_messages = 2
1424
- self.nbr_messages = 0
1425
- self.empty_text = ""
1426
- self._soup = BeautifulSoup(input, features="xml")
1427
-
1428
- def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1429
- """Create a unified range along the table groups.
1430
-
1431
- Args:
1432
- tgs: Table group column specifications.
1433
-
1434
- Returns:
1435
- Unified group column specifications.
1436
- """
1437
- colinfo: dict[int, XmlTable.ColInfoType] = {}
1438
-
1439
- if len(tgs) == 0:
1440
- return colinfo
1441
-
1442
- for itg, tg in enumerate(tgs):
1443
- colinfo[itg] = {
1444
- "offset": [],
1445
- "colwidth": [],
1446
- "cell_range": [],
1447
- "cell_offst": [0],
1448
- }
1449
- offst = 0
1450
- for info in tg["colinfo"]:
1451
- cw = info["colwidth"]
1452
- cw = re.sub("pt", "", cw, flags=re.I)
1453
- cw = re.sub("mm", "", cw, flags=re.I)
1454
- try:
1455
- cw = int(cw)
1456
- except BaseException:
1457
- cw = float(cw)
1458
- colinfo[itg]["colwidth"].append(cw)
1459
- colinfo[itg]["offset"].append(offst)
1460
- offst += cw
1461
- colinfo[itg]["offset"].append(offst)
1462
-
1463
- min_colinfo: XmlTable.MinColInfoType = {"offset": [], "colwidth": []}
1464
-
1465
- min_colinfo["offset"] = colinfo[0]["offset"]
1466
- offset_w0 = []
1467
- for itg, col in colinfo.items():
1468
- # keep track of col with 0 width
1469
- for ic, cw in enumerate(col["colwidth"]):
1470
- if cw == 0:
1471
- offset_w0.append(col["offset"][ic])
1472
-
1473
- min_colinfo["offset"] = sorted(
1474
- list(set(col["offset"] + min_colinfo["offset"]))
1475
- )
1476
-
1477
- # add back the 0 width cols to offset list
1478
- offset_w0 = list(set(offset_w0))
1479
- min_colinfo["offset"] = sorted(min_colinfo["offset"] + offset_w0)
1480
-
1481
- for i in range(len(min_colinfo["offset"]) - 1):
1482
- min_colinfo["colwidth"].append(
1483
- min_colinfo["offset"][i + 1] - min_colinfo["offset"][i]
1484
- )
1485
-
1486
- for itg, col in colinfo.items():
1487
- i = 1
1488
- range_ = 1
1489
- for min_i in range(1, len(min_colinfo["offset"])):
1490
- min_offst = min_colinfo["offset"][min_i]
1491
- offst = col["offset"][i]
1492
- if min_offst == offst:
1493
- if (
1494
- len(col["offset"]) == i + 1
1495
- and len(min_colinfo["offset"]) > min_i + 1
1496
- ):
1497
- range_ += 1
1498
- else:
1499
- col["cell_range"].append(range_)
1500
- col["cell_offst"].append(col["cell_offst"][-1] + range_)
1501
- range_ = 1
1502
- i += 1
1503
- elif min_offst < offst:
1504
- range_ += 1
1505
- else:
1506
- _log.debug("A USPTO XML table has wrong offsets.")
1507
- return {}
1508
-
1509
- return colinfo
1510
-
1511
- def _get_max_ncols(self, tgs_info: dict[int, ColInfoType]) -> NonNegativeInt:
1512
- """Get the maximum number of columns across table groups.
1513
-
1514
- Args:
1515
- tgs_info: Unified group column specifications.
1516
-
1517
- Return:
1518
- The maximum number of columns.
1519
- """
1520
- ncols_max = 0
1521
- for rowinfo in tgs_info.values():
1522
- ncols_max = max(ncols_max, len(rowinfo["colwidth"]))
1523
-
1524
- return ncols_max
1525
-
1526
- def _parse_table(self, table: Tag) -> TableData:
1527
- """Parse the content of a table tag.
1528
-
1529
- Args:
1530
- The table element.
1531
-
1532
- Returns:
1533
- A docling table object.
1534
- """
1535
- tgs_align = []
1536
- tg_secs = table.find_all("tgroup")
1537
- if tg_secs:
1538
- for tg_sec in tg_secs:
1539
- ncols = tg_sec.get("cols", None)
1540
- if ncols:
1541
- ncols = int(ncols)
1542
- tg_align = {"ncols": ncols, "colinfo": []}
1543
- cs_secs = tg_sec.find_all("colspec")
1544
- if cs_secs:
1545
- for cs_sec in cs_secs:
1546
- colname = cs_sec.get("colname", None)
1547
- colwidth = cs_sec.get("colwidth", None)
1548
- tg_align["colinfo"].append(
1549
- {"colname": colname, "colwidth": colwidth}
1550
- )
1551
-
1552
- tgs_align.append(tg_align)
1553
-
1554
- # create unified range along the table groups
1555
- tgs_range = self._create_tg_range(tgs_align)
1556
-
1557
- # if the structure is broken, return an empty table
1558
- if not tgs_range:
1559
- dl_table = TableData(num_rows=0, num_cols=0, table_cells=[])
1560
- return dl_table
1561
-
1562
- ncols_max = self._get_max_ncols(tgs_range)
1563
-
1564
- # extract table data
1565
- table_data: list[TableCell] = []
1566
- i_row_global = 0
1567
- is_row_empty: bool = True
1568
- tg_secs = table.find_all("tgroup")
1569
- if tg_secs:
1570
- for itg, tg_sec in enumerate(tg_secs):
1571
- tg_range = tgs_range[itg]
1572
- row_secs = tg_sec.find_all(["row", "tr"])
1573
-
1574
- if row_secs:
1575
- for row_sec in row_secs:
1576
- entry_secs = row_sec.find_all(["entry", "td"])
1577
- is_header: bool = row_sec.parent.name in ["thead"]
1578
-
1579
- ncols = 0
1580
- local_row: list[TableCell] = []
1581
- is_row_empty = True
1582
- if entry_secs:
1583
- wrong_nbr_cols = False
1584
- for ientry, entry_sec in enumerate(entry_secs):
1585
- text = entry_sec.get_text().strip()
1586
-
1587
- # start-end
1588
- namest = entry_sec.attrs.get("namest", None)
1589
- nameend = entry_sec.attrs.get("nameend", None)
1590
- if isinstance(namest, str) and namest.isnumeric():
1591
- namest = int(namest)
1592
- else:
1593
- namest = ientry + 1
1594
- if isinstance(nameend, str) and nameend.isnumeric():
1595
- nameend = int(nameend)
1596
- shift = 0
1597
- else:
1598
- nameend = ientry + 2
1599
- shift = 1
1600
-
1601
- if nameend > len(tg_range["cell_offst"]):
1602
- wrong_nbr_cols = True
1603
- self.nbr_messages += 1
1604
- if self.nbr_messages <= self.max_nbr_messages:
1605
- _log.debug(
1606
- "USPTO table has # entries != # columns"
1607
- )
1608
- break
1609
-
1610
- range_ = [
1611
- tg_range["cell_offst"][namest - 1],
1612
- tg_range["cell_offst"][nameend - 1] - shift,
1613
- ]
1614
-
1615
- # add row and replicate cell if needed
1616
- cell_text = text if text else self.empty_text
1617
- if cell_text != self.empty_text:
1618
- is_row_empty = False
1619
- for irep in range(range_[0], range_[1] + 1):
1620
- ncols += 1
1621
- local_row.append(
1622
- TableCell(
1623
- column_header=is_header,
1624
- text=cell_text,
1625
- start_row_offset_idx=i_row_global,
1626
- end_row_offset_idx=i_row_global + 1,
1627
- row_span=1,
1628
- start_col_offset_idx=range_[0],
1629
- end_col_offset_idx=range_[1] + 1,
1630
- col_span=range_[1] - range_[0] + 1,
1631
- )
1632
- )
1633
-
1634
- if wrong_nbr_cols:
1635
- # keep empty text, not to introduce noise
1636
- local_row = []
1637
- ncols = 0
1638
-
1639
- # add empty cell up to ncols_max
1640
- for irep in range(ncols, ncols_max):
1641
- local_row.append(
1642
- TableCell(
1643
- column_header=is_header,
1644
- text=self.empty_text,
1645
- start_row_offset_idx=i_row_global,
1646
- end_row_offset_idx=i_row_global + 1,
1647
- row_span=1,
1648
- start_col_offset_idx=irep,
1649
- end_col_offset_idx=irep + 1,
1650
- col_span=1,
1651
- )
1652
- )
1653
- # do not add empty rows
1654
- if not is_row_empty:
1655
- table_data.extend(local_row)
1656
- i_row_global += 1
1657
-
1658
- dl_table = TableData(
1659
- num_rows=i_row_global, num_cols=ncols_max, table_cells=table_data
1660
- )
1661
-
1662
- return dl_table
1663
-
1664
- def parse(self) -> Optional[TableData]:
1665
- """Parse the first table from an xml content.
1666
-
1667
- Returns:
1668
- A docling table data.
1669
- """
1670
- section = self._soup.find("table")
1671
- if section is not None:
1672
- table = self._parse_table(section)
1673
- if table.num_rows == 0 or table.num_cols == 0:
1674
- _log.warning("The parsed USPTO table is empty")
1675
- return table
1676
- else:
1677
- return None
1678
-
1679
-
1680
- class HtmlEntity:
1681
- """Provide utility functions to get the HTML entities of styled characters.
1682
-
1683
- This class has been developped from:
1684
- https://unicode-table.com/en/html-entities/
1685
- https://www.w3.org/TR/WD-math-970515/table03.html
1686
- """
1687
-
1688
- def __init__(self):
1689
- """Initialize this class by loading the HTML entity dictionaries."""
1690
- self.superscript = str.maketrans(
1691
- {
1692
- "1": "&sup1;",
1693
- "2": "&sup2;",
1694
- "3": "&sup3;",
1695
- "4": "&#8308;",
1696
- "5": "&#8309;",
1697
- "6": "&#8310;",
1698
- "7": "&#8311;",
1699
- "8": "&#8312;",
1700
- "9": "&#8313;",
1701
- "0": "&#8304;",
1702
- "+": "&#8314;",
1703
- "-": "&#8315;",
1704
- "−": "&#8315;",
1705
- "=": "&#8316;",
1706
- "(": "&#8317;",
1707
- ")": "&#8318;",
1708
- "a": "&#170;",
1709
- "o": "&#186;",
1710
- "i": "&#8305;",
1711
- "n": "&#8319;",
1712
- }
1713
- )
1714
- self.subscript = str.maketrans(
1715
- {
1716
- "1": "&#8321;",
1717
- "2": "&#8322;",
1718
- "3": "&#8323;",
1719
- "4": "&#8324;",
1720
- "5": "&#8325;",
1721
- "6": "&#8326;",
1722
- "7": "&#8327;",
1723
- "8": "&#8328;",
1724
- "9": "&#8329;",
1725
- "0": "&#8320;",
1726
- "+": "&#8330;",
1727
- "-": "&#8331;",
1728
- "−": "&#8331;",
1729
- "=": "&#8332;",
1730
- "(": "&#8333;",
1731
- ")": "&#8334;",
1732
- "a": "&#8336;",
1733
- "e": "&#8337;",
1734
- "o": "&#8338;",
1735
- "x": "&#8339;",
1736
- }
1737
- )
1738
- self.mathematical_italic = str.maketrans(
1739
- {
1740
- "A": "&#119860;",
1741
- "B": "&#119861;",
1742
- "C": "&#119862;",
1743
- "D": "&#119863;",
1744
- "E": "&#119864;",
1745
- "F": "&#119865;",
1746
- "G": "&#119866;",
1747
- "H": "&#119867;",
1748
- "I": "&#119868;",
1749
- "J": "&#119869;",
1750
- "K": "&#119870;",
1751
- "L": "&#119871;",
1752
- "M": "&#119872;",
1753
- "N": "&#119873;",
1754
- "O": "&#119874;",
1755
- "P": "&#119875;",
1756
- "Q": "&#119876;",
1757
- "R": "&#119877;",
1758
- "S": "&#119878;",
1759
- "T": "&#119879;",
1760
- "U": "&#119880;",
1761
- "V": "&#119881;",
1762
- "W": "&#119882;",
1763
- "Y": "&#119884;",
1764
- "Z": "&#119885;",
1765
- "a": "&#119886;",
1766
- "b": "&#119887;",
1767
- "c": "&#119888;",
1768
- "d": "&#119889;",
1769
- "e": "&#119890;",
1770
- "f": "&#119891;",
1771
- "g": "&#119892;",
1772
- "h": "&#119893;",
1773
- "i": "&#119894;",
1774
- "j": "&#119895;",
1775
- "k": "&#119896;",
1776
- "l": "&#119897;",
1777
- "m": "&#119898;",
1778
- "n": "&#119899;",
1779
- "o": "&#119900;",
1780
- "p": "&#119901;",
1781
- "q": "&#119902;",
1782
- "r": "&#119903;",
1783
- "s": "&#119904;",
1784
- "t": "&#119905;",
1785
- "u": "&#119906;",
1786
- "v": "&#119907;",
1787
- "w": "&#119908;",
1788
- "x": "&#119909;",
1789
- "y": "&#119910;",
1790
- "z": "&#119911;",
1791
- }
1792
- )
1793
-
1794
- self.lookup_iso8879 = {
1795
- "&Agr;": "&Alpha;",
1796
- "&Bgr;": "&Beta;",
1797
- "&Ggr;": "&Gamma;",
1798
- "&Dgr;": "&Delta;",
1799
- "&Egr;": "&Epsilon;",
1800
- "&Zgr;": "&Zeta;",
1801
- "&EEgr;": "&Eta;",
1802
- "&THgr;": "&Theta;",
1803
- "&Igr;": "&Iota;",
1804
- "&Kgr;": "&Kappa;",
1805
- "&Lgr;": "&Lambda;",
1806
- "&Mgr;": "&Mu;",
1807
- "&Ngr;": "&Nu;",
1808
- "&Xgr;": "&Xi;",
1809
- "&Ogr;": "&Omicron;",
1810
- "&Pgr;": "&Pi;",
1811
- "&Rgr;": "&Rho;",
1812
- "&Sgr;": "&Sigma;",
1813
- "&Tgr;": "&Tau;",
1814
- "&Ugr;": "&Upsilon;",
1815
- "&PHgr;": "&Phi;",
1816
- "&KHgr;": "&Chi;",
1817
- "&PSgr;": "&Psi;",
1818
- "&OHgr;": "&Omega;",
1819
- "&agr;": "&alpha;",
1820
- "&bgr;": "&beta;",
1821
- "&ggr;": "&gamma;",
1822
- "&dgr;": "&delta;",
1823
- "&egr;": "&epsilon;",
1824
- "&zgr;": "&zeta;",
1825
- "&eegr;": "&eta;",
1826
- "&thgr;": "&theta;",
1827
- "&igr;": "&iota;",
1828
- "&kgr;": "&kappa;",
1829
- "&lgr;": "&lambda;",
1830
- "&mgr;": "&mu;",
1831
- "&ngr;": "&nu;",
1832
- "&xgr;": "&xi;",
1833
- "&ogr;": "&omicron;",
1834
- "&pgr;": "&pi;",
1835
- "&rgr;": "&rho;",
1836
- "&sgr;": "&sigmaf;",
1837
- "&tgr;": "&tau;",
1838
- "&ugr;": "&upsilon;",
1839
- "&phgr;": "&phi;",
1840
- "&khgr;": "&chi;",
1841
- "&psgr;": "&psi;",
1842
- "&ohgr;": "&omega;",
1843
- }
1844
-
1845
- def get_superscript(self, text: str) -> str:
1846
- """Get a text in superscript as HTML entities.
1847
-
1848
- Args:
1849
- text: The text to transform.
1850
-
1851
- Returns:
1852
- The text in superscript as HTML entities.
1853
- """
1854
- return text.translate(self.superscript)
1855
-
1856
- def get_subscript(self, text: str) -> str:
1857
- """Get a text in subscript as HTML entities.
1858
-
1859
- Args:
1860
- The text to transform.
1861
-
1862
- Returns:
1863
- The text in subscript as HTML entities.
1864
- """
1865
- return text.translate(self.subscript)
1866
-
1867
- def get_math_italic(self, text: str) -> str:
1868
- """Get a text in italic as HTML entities.
1869
-
1870
- Args:
1871
- The text to transform.
1872
-
1873
- Returns:
1874
- The text in italics as HTML entities.
1875
- """
1876
- return text.translate(self.mathematical_italic)
1877
-
1878
- def get_greek_from_iso8879(self, text: str) -> str:
1879
- """Get an HTML entity of a greek letter in ISO 8879.
1880
-
1881
- Args:
1882
- The text to transform, as an ISO 8879 entitiy.
1883
-
1884
- Returns:
1885
- The HTML entity representing a greek letter. If the input text is not
1886
- supported, the original text is returned.
1887
- """
1888
- return self.lookup_iso8879.get(text, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/chunking/__init__.py DELETED
@@ -1,12 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
- from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
7
- from docling_core.transforms.chunker.hierarchical_chunker import (
8
- DocChunk,
9
- DocMeta,
10
- HierarchicalChunker,
11
- )
12
- from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/cli/main.py DELETED
@@ -1,456 +0,0 @@
1
- import importlib
2
- import logging
3
- import platform
4
- import re
5
- import sys
6
- import tempfile
7
- import time
8
- import warnings
9
- from pathlib import Path
10
- from typing import Annotated, Dict, Iterable, List, Optional, Type
11
-
12
- import typer
13
- from docling_core.types.doc import ImageRefMode
14
- from docling_core.utils.file import resolve_source_to_path
15
- from pydantic import TypeAdapter
16
-
17
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
19
- from docling.backend.pdf_backend import PdfDocumentBackend
20
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
21
- from docling.datamodel.base_models import (
22
- ConversionStatus,
23
- FormatToExtensions,
24
- InputFormat,
25
- OutputFormat,
26
- )
27
- from docling.datamodel.document import ConversionResult
28
- from docling.datamodel.pipeline_options import (
29
- AcceleratorDevice,
30
- AcceleratorOptions,
31
- EasyOcrOptions,
32
- OcrEngine,
33
- OcrMacOptions,
34
- OcrOptions,
35
- PdfBackend,
36
- PdfPipelineOptions,
37
- RapidOcrOptions,
38
- TableFormerMode,
39
- TesseractCliOcrOptions,
40
- TesseractOcrOptions,
41
- )
42
- from docling.datamodel.settings import settings
43
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
44
-
45
- warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
46
- warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
47
-
48
- _log = logging.getLogger(__name__)
49
- from rich.console import Console
50
-
51
- err_console = Console(stderr=True)
52
-
53
-
54
- app = typer.Typer(
55
- name="Docling",
56
- no_args_is_help=True,
57
- add_completion=False,
58
- pretty_exceptions_enable=False,
59
- )
60
-
61
-
62
- def version_callback(value: bool):
63
- if value:
64
- docling_version = importlib.metadata.version("docling")
65
- docling_core_version = importlib.metadata.version("docling-core")
66
- docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
67
- docling_parse_version = importlib.metadata.version("docling-parse")
68
- platform_str = platform.platform()
69
- py_impl_version = sys.implementation.cache_tag
70
- py_lang_version = platform.python_version()
71
- print(f"Docling version: {docling_version}")
72
- print(f"Docling Core version: {docling_core_version}")
73
- print(f"Docling IBM Models version: {docling_ibm_models_version}")
74
- print(f"Docling Parse version: {docling_parse_version}")
75
- print(f"Python: {py_impl_version} ({py_lang_version})")
76
- print(f"Platform: {platform_str}")
77
- raise typer.Exit()
78
-
79
-
80
- def export_documents(
81
- conv_results: Iterable[ConversionResult],
82
- output_dir: Path,
83
- export_json: bool,
84
- export_html: bool,
85
- export_md: bool,
86
- export_txt: bool,
87
- export_doctags: bool,
88
- image_export_mode: ImageRefMode,
89
- ):
90
-
91
- success_count = 0
92
- failure_count = 0
93
-
94
- for conv_res in conv_results:
95
- if conv_res.status == ConversionStatus.SUCCESS:
96
- success_count += 1
97
- doc_filename = conv_res.input.file.stem
98
-
99
- # Export JSON format:
100
- if export_json:
101
- fname = output_dir / f"{doc_filename}.json"
102
- _log.info(f"writing JSON output to {fname}")
103
- conv_res.document.save_as_json(
104
- filename=fname, image_mode=image_export_mode
105
- )
106
-
107
- # Export HTML format:
108
- if export_html:
109
- fname = output_dir / f"{doc_filename}.html"
110
- _log.info(f"writing HTML output to {fname}")
111
- conv_res.document.save_as_html(
112
- filename=fname, image_mode=image_export_mode
113
- )
114
-
115
- # Export Text format:
116
- if export_txt:
117
- fname = output_dir / f"{doc_filename}.txt"
118
- _log.info(f"writing TXT output to {fname}")
119
- conv_res.document.save_as_markdown(
120
- filename=fname,
121
- strict_text=True,
122
- image_mode=ImageRefMode.PLACEHOLDER,
123
- )
124
-
125
- # Export Markdown format:
126
- if export_md:
127
- fname = output_dir / f"{doc_filename}.md"
128
- _log.info(f"writing Markdown output to {fname}")
129
- conv_res.document.save_as_markdown(
130
- filename=fname, image_mode=image_export_mode
131
- )
132
-
133
- # Export Document Tags format:
134
- if export_doctags:
135
- fname = output_dir / f"{doc_filename}.doctags"
136
- _log.info(f"writing Doc Tags output to {fname}")
137
- conv_res.document.save_as_document_tokens(filename=fname)
138
-
139
- else:
140
- _log.warning(f"Document {conv_res.input.file} failed to convert.")
141
- failure_count += 1
142
-
143
- _log.info(
144
- f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
145
- )
146
-
147
-
148
- def _split_list(raw: Optional[str]) -> Optional[List[str]]:
149
- if raw is None:
150
- return None
151
- return re.split(r"[;,]", raw)
152
-
153
-
154
- @app.command(no_args_is_help=True)
155
- def convert(
156
- input_sources: Annotated[
157
- List[str],
158
- typer.Argument(
159
- ...,
160
- metavar="source",
161
- help="PDF files to convert. Can be local file / directory paths or URL.",
162
- ),
163
- ],
164
- from_formats: List[InputFormat] = typer.Option(
165
- None,
166
- "--from",
167
- help="Specify input formats to convert from. Defaults to all formats.",
168
- ),
169
- to_formats: List[OutputFormat] = typer.Option(
170
- None, "--to", help="Specify output formats. Defaults to Markdown."
171
- ),
172
- headers: str = typer.Option(
173
- None,
174
- "--headers",
175
- help="Specify http request headers used when fetching url input sources in the form of a JSON string",
176
- ),
177
- image_export_mode: Annotated[
178
- ImageRefMode,
179
- typer.Option(
180
- ...,
181
- help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
182
- ),
183
- ] = ImageRefMode.EMBEDDED,
184
- ocr: Annotated[
185
- bool,
186
- typer.Option(
187
- ..., help="If enabled, the bitmap content will be processed using OCR."
188
- ),
189
- ] = True,
190
- force_ocr: Annotated[
191
- bool,
192
- typer.Option(
193
- ...,
194
- help="Replace any existing text with OCR generated text over the full content.",
195
- ),
196
- ] = False,
197
- ocr_engine: Annotated[
198
- OcrEngine, typer.Option(..., help="The OCR engine to use.")
199
- ] = OcrEngine.EASYOCR,
200
- ocr_lang: Annotated[
201
- Optional[str],
202
- typer.Option(
203
- ...,
204
- help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
205
- ),
206
- ] = None,
207
- pdf_backend: Annotated[
208
- PdfBackend, typer.Option(..., help="The PDF backend to use.")
209
- ] = PdfBackend.DLPARSE_V2,
210
- table_mode: Annotated[
211
- TableFormerMode,
212
- typer.Option(..., help="The mode to use in the table structure model."),
213
- ] = TableFormerMode.FAST,
214
- enrich_code: Annotated[
215
- bool,
216
- typer.Option(..., help="Enable the code enrichment model in the pipeline."),
217
- ] = False,
218
- enrich_formula: Annotated[
219
- bool,
220
- typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
221
- ] = False,
222
- enrich_picture_classes: Annotated[
223
- bool,
224
- typer.Option(
225
- ...,
226
- help="Enable the picture classification enrichment model in the pipeline.",
227
- ),
228
- ] = False,
229
- enrich_picture_description: Annotated[
230
- bool,
231
- typer.Option(..., help="Enable the picture description model in the pipeline."),
232
- ] = False,
233
- artifacts_path: Annotated[
234
- Optional[Path],
235
- typer.Option(..., help="If provided, the location of the model artifacts."),
236
- ] = None,
237
- abort_on_error: Annotated[
238
- bool,
239
- typer.Option(
240
- ...,
241
- "--abort-on-error/--no-abort-on-error",
242
- help="If enabled, the bitmap content will be processed using OCR.",
243
- ),
244
- ] = False,
245
- output: Annotated[
246
- Path, typer.Option(..., help="Output directory where results are saved.")
247
- ] = Path("."),
248
- verbose: Annotated[
249
- int,
250
- typer.Option(
251
- "--verbose",
252
- "-v",
253
- count=True,
254
- help="Set the verbosity level. -v for info logging, -vv for debug logging.",
255
- ),
256
- ] = 0,
257
- debug_visualize_cells: Annotated[
258
- bool,
259
- typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
260
- ] = False,
261
- debug_visualize_ocr: Annotated[
262
- bool,
263
- typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
264
- ] = False,
265
- debug_visualize_layout: Annotated[
266
- bool,
267
- typer.Option(
268
- ..., help="Enable debug output which visualizes the layour clusters"
269
- ),
270
- ] = False,
271
- debug_visualize_tables: Annotated[
272
- bool,
273
- typer.Option(..., help="Enable debug output which visualizes the table cells"),
274
- ] = False,
275
- version: Annotated[
276
- Optional[bool],
277
- typer.Option(
278
- "--version",
279
- callback=version_callback,
280
- is_eager=True,
281
- help="Show version information.",
282
- ),
283
- ] = None,
284
- document_timeout: Annotated[
285
- Optional[float],
286
- typer.Option(
287
- ...,
288
- help="The timeout for processing each document, in seconds.",
289
- ),
290
- ] = None,
291
- num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
292
- device: Annotated[
293
- AcceleratorDevice, typer.Option(..., help="Accelerator device")
294
- ] = AcceleratorDevice.AUTO,
295
- ):
296
- if verbose == 0:
297
- logging.basicConfig(level=logging.WARNING)
298
- elif verbose == 1:
299
- logging.basicConfig(level=logging.INFO)
300
- elif verbose == 2:
301
- logging.basicConfig(level=logging.DEBUG)
302
-
303
- settings.debug.visualize_cells = debug_visualize_cells
304
- settings.debug.visualize_layout = debug_visualize_layout
305
- settings.debug.visualize_tables = debug_visualize_tables
306
- settings.debug.visualize_ocr = debug_visualize_ocr
307
-
308
- if from_formats is None:
309
- from_formats = [e for e in InputFormat]
310
-
311
- parsed_headers: Optional[Dict[str, str]] = None
312
- if headers is not None:
313
- headers_t = TypeAdapter(Dict[str, str])
314
- parsed_headers = headers_t.validate_json(headers)
315
-
316
- with tempfile.TemporaryDirectory() as tempdir:
317
- input_doc_paths: List[Path] = []
318
- for src in input_sources:
319
- try:
320
- # check if we can fetch some remote url
321
- source = resolve_source_to_path(
322
- source=src, headers=parsed_headers, workdir=Path(tempdir)
323
- )
324
- input_doc_paths.append(source)
325
- except FileNotFoundError:
326
- err_console.print(
327
- f"[red]Error: The input file {src} does not exist.[/red]"
328
- )
329
- raise typer.Abort()
330
- except IsADirectoryError:
331
- # if the input matches to a file or a folder
332
- try:
333
- local_path = TypeAdapter(Path).validate_python(src)
334
- if local_path.exists() and local_path.is_dir():
335
- for fmt in from_formats:
336
- for ext in FormatToExtensions[fmt]:
337
- input_doc_paths.extend(
338
- list(local_path.glob(f"**/*.{ext}"))
339
- )
340
- input_doc_paths.extend(
341
- list(local_path.glob(f"**/*.{ext.upper()}"))
342
- )
343
- elif local_path.exists():
344
- input_doc_paths.append(local_path)
345
- else:
346
- err_console.print(
347
- f"[red]Error: The input file {src} does not exist.[/red]"
348
- )
349
- raise typer.Abort()
350
- except Exception as err:
351
- err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
352
- _log.info(err) # will print more details if verbose is activated
353
- raise typer.Abort()
354
-
355
- if to_formats is None:
356
- to_formats = [OutputFormat.MARKDOWN]
357
-
358
- export_json = OutputFormat.JSON in to_formats
359
- export_html = OutputFormat.HTML in to_formats
360
- export_md = OutputFormat.MARKDOWN in to_formats
361
- export_txt = OutputFormat.TEXT in to_formats
362
- export_doctags = OutputFormat.DOCTAGS in to_formats
363
-
364
- if ocr_engine == OcrEngine.EASYOCR:
365
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
366
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
367
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
368
- elif ocr_engine == OcrEngine.TESSERACT:
369
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
370
- elif ocr_engine == OcrEngine.OCRMAC:
371
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
372
- elif ocr_engine == OcrEngine.RAPIDOCR:
373
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
374
- else:
375
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
376
-
377
- ocr_lang_list = _split_list(ocr_lang)
378
- if ocr_lang_list is not None:
379
- ocr_options.lang = ocr_lang_list
380
-
381
- accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
382
- pipeline_options = PdfPipelineOptions(
383
- accelerator_options=accelerator_options,
384
- do_ocr=ocr,
385
- ocr_options=ocr_options,
386
- do_table_structure=True,
387
- do_code_enrichment=enrich_code,
388
- do_formula_enrichment=enrich_formula,
389
- do_picture_description=enrich_picture_description,
390
- do_picture_classification=enrich_picture_classes,
391
- document_timeout=document_timeout,
392
- )
393
- pipeline_options.table_structure_options.do_cell_matching = (
394
- True # do_cell_matching
395
- )
396
- pipeline_options.table_structure_options.mode = table_mode
397
-
398
- if image_export_mode != ImageRefMode.PLACEHOLDER:
399
- pipeline_options.generate_page_images = True
400
- pipeline_options.generate_picture_images = (
401
- True # FIXME: to be deprecated in verson 3
402
- )
403
- pipeline_options.images_scale = 2
404
-
405
- if artifacts_path is not None:
406
- pipeline_options.artifacts_path = artifacts_path
407
-
408
- if pdf_backend == PdfBackend.DLPARSE_V1:
409
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
410
- elif pdf_backend == PdfBackend.DLPARSE_V2:
411
- backend = DoclingParseV2DocumentBackend
412
- elif pdf_backend == PdfBackend.PYPDFIUM2:
413
- backend = PyPdfiumDocumentBackend
414
- else:
415
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
416
-
417
- pdf_format_option = PdfFormatOption(
418
- pipeline_options=pipeline_options,
419
- backend=backend, # pdf_backend
420
- )
421
- format_options: Dict[InputFormat, FormatOption] = {
422
- InputFormat.PDF: pdf_format_option,
423
- InputFormat.IMAGE: pdf_format_option,
424
- }
425
- doc_converter = DocumentConverter(
426
- allowed_formats=from_formats,
427
- format_options=format_options,
428
- )
429
-
430
- start_time = time.time()
431
-
432
- conv_results = doc_converter.convert_all(
433
- input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
434
- )
435
-
436
- output.mkdir(parents=True, exist_ok=True)
437
- export_documents(
438
- conv_results,
439
- output_dir=output,
440
- export_json=export_json,
441
- export_html=export_html,
442
- export_md=export_md,
443
- export_txt=export_txt,
444
- export_doctags=export_doctags,
445
- image_export_mode=image_export_mode,
446
- )
447
-
448
- end_time = time.time() - start_time
449
-
450
- _log.info(f"All documents were converted in {end_time:.2f} seconds.")
451
-
452
-
453
- click_app = typer.main.get_command(app)
454
-
455
- if __name__ == "__main__":
456
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/models.py DELETED
@@ -1,107 +0,0 @@
1
- import logging
2
- import warnings
3
- from enum import Enum
4
- from pathlib import Path
5
- from typing import Annotated, Optional
6
-
7
- import typer
8
- from rich.console import Console
9
- from rich.logging import RichHandler
10
-
11
- from docling.datamodel.settings import settings
12
- from docling.utils.model_downloader import download_models
13
-
14
- warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
15
- warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
16
-
17
- console = Console()
18
- err_console = Console(stderr=True)
19
-
20
-
21
- app = typer.Typer(
22
- name="Docling models helper",
23
- no_args_is_help=True,
24
- add_completion=False,
25
- pretty_exceptions_enable=False,
26
- )
27
-
28
-
29
- class _AvailableModels(str, Enum):
30
- LAYOUT = "layout"
31
- TABLEFORMER = "tableformer"
32
- CODE_FORMULA = "code_formula"
33
- PICTURE_CLASSIFIER = "picture_classifier"
34
- SMOLVLM = "smolvlm"
35
- EASYOCR = "easyocr"
36
-
37
-
38
- @app.command("download")
39
- def download(
40
- output_dir: Annotated[
41
- Path,
42
- typer.Option(
43
- ...,
44
- "-o",
45
- "--output-dir",
46
- help="The directory where all the models are downloaded.",
47
- ),
48
- ] = (settings.cache_dir / "models"),
49
- force: Annotated[
50
- bool, typer.Option(..., help="If true, the download will be forced")
51
- ] = False,
52
- models: Annotated[
53
- Optional[list[_AvailableModels]],
54
- typer.Argument(
55
- help=f"Models to download (default behavior: all will be downloaded)",
56
- ),
57
- ] = None,
58
- quiet: Annotated[
59
- bool,
60
- typer.Option(
61
- ...,
62
- "-q",
63
- "--quiet",
64
- help="No extra output is generated, the CLI prints only the directory with the cached models.",
65
- ),
66
- ] = False,
67
- ):
68
- if not quiet:
69
- FORMAT = "%(message)s"
70
- logging.basicConfig(
71
- level=logging.INFO,
72
- format="[blue]%(message)s[/blue]",
73
- datefmt="[%X]",
74
- handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
75
- )
76
- to_download = models or [m for m in _AvailableModels]
77
- output_dir = download_models(
78
- output_dir=output_dir,
79
- force=force,
80
- progress=(not quiet),
81
- with_layout=_AvailableModels.LAYOUT in to_download,
82
- with_tableformer=_AvailableModels.TABLEFORMER in to_download,
83
- with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
84
- with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
85
- with_smolvlm=_AvailableModels.SMOLVLM in to_download,
86
- with_easyocr=_AvailableModels.EASYOCR in to_download,
87
- )
88
-
89
- if quiet:
90
- typer.echo(output_dir)
91
- else:
92
- typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
93
-
94
- console.print(
95
- "\n",
96
- "Docling can now be configured for running offline using the local artifacts.\n\n",
97
- "Using the CLI:",
98
- f"`docling --artifacts-path={output_dir} FILE`",
99
- "\n",
100
- "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
101
- )
102
-
103
-
104
- click_app = typer.main.get_command(app)
105
-
106
- if __name__ == "__main__":
107
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/cli/tools.py DELETED
@@ -1,17 +0,0 @@
1
- import typer
2
-
3
- from docling.cli.models import app as models_app
4
-
5
- app = typer.Typer(
6
- name="Docling helpers",
7
- no_args_is_help=True,
8
- add_completion=False,
9
- pretty_exceptions_enable=False,
10
- )
11
-
12
- app.add_typer(models_app, name="models")
13
-
14
- click_app = typer.main.get_command(app)
15
-
16
- if __name__ == "__main__":
17
- app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/base_models.py DELETED
@@ -1,258 +0,0 @@
1
- from enum import Enum
2
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
3
-
4
- from docling_core.types.doc import (
5
- BoundingBox,
6
- DocItemLabel,
7
- NodeItem,
8
- PictureDataType,
9
- Size,
10
- TableCell,
11
- )
12
- from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
13
- DocumentStream,
14
- )
15
- from PIL.Image import Image
16
- from pydantic import BaseModel, ConfigDict
17
-
18
- if TYPE_CHECKING:
19
- from docling.backend.pdf_backend import PdfPageBackend
20
-
21
-
22
- class ConversionStatus(str, Enum):
23
- PENDING = "pending"
24
- STARTED = "started"
25
- FAILURE = "failure"
26
- SUCCESS = "success"
27
- PARTIAL_SUCCESS = "partial_success"
28
- SKIPPED = "skipped"
29
-
30
-
31
- class InputFormat(str, Enum):
32
- """A document format supported by document backend parsers."""
33
-
34
- DOCX = "docx"
35
- PPTX = "pptx"
36
- HTML = "html"
37
- XML_PUBMED = "xml_pubmed"
38
- IMAGE = "image"
39
- PDF = "pdf"
40
- ASCIIDOC = "asciidoc"
41
- MD = "md"
42
- XLSX = "xlsx"
43
- XML_USPTO = "xml_uspto"
44
- JSON_DOCLING = "json_docling"
45
-
46
-
47
- class OutputFormat(str, Enum):
48
- MARKDOWN = "md"
49
- JSON = "json"
50
- HTML = "html"
51
- TEXT = "text"
52
- DOCTAGS = "doctags"
53
-
54
-
55
- FormatToExtensions: Dict[InputFormat, List[str]] = {
56
- InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
57
- InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
58
- InputFormat.PDF: ["pdf"],
59
- InputFormat.MD: ["md"],
60
- InputFormat.HTML: ["html", "htm", "xhtml"],
61
- InputFormat.XML_PUBMED: ["xml", "nxml"],
62
- InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
63
- InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
64
- InputFormat.XLSX: ["xlsx"],
65
- InputFormat.XML_USPTO: ["xml", "txt"],
66
- InputFormat.JSON_DOCLING: ["json"],
67
- }
68
-
69
- FormatToMimeType: Dict[InputFormat, List[str]] = {
70
- InputFormat.DOCX: [
71
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
72
- "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
73
- ],
74
- InputFormat.PPTX: [
75
- "application/vnd.openxmlformats-officedocument.presentationml.template",
76
- "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
77
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
78
- ],
79
- InputFormat.HTML: ["text/html", "application/xhtml+xml"],
80
- InputFormat.XML_PUBMED: ["application/xml"],
81
- InputFormat.IMAGE: [
82
- "image/png",
83
- "image/jpeg",
84
- "image/tiff",
85
- "image/gif",
86
- "image/bmp",
87
- ],
88
- InputFormat.PDF: ["application/pdf"],
89
- InputFormat.ASCIIDOC: ["text/asciidoc"],
90
- InputFormat.MD: ["text/markdown", "text/x-markdown"],
91
- InputFormat.XLSX: [
92
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
93
- ],
94
- InputFormat.XML_USPTO: ["application/xml", "text/plain"],
95
- InputFormat.JSON_DOCLING: ["application/json"],
96
- }
97
-
98
- MimeTypeToFormat: dict[str, list[InputFormat]] = {
99
- mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
100
- for value in FormatToMimeType.values()
101
- for mime in value
102
- }
103
-
104
-
105
- class DocInputType(str, Enum):
106
- PATH = "path"
107
- STREAM = "stream"
108
-
109
-
110
- class DoclingComponentType(str, Enum):
111
- DOCUMENT_BACKEND = "document_backend"
112
- MODEL = "model"
113
- DOC_ASSEMBLER = "doc_assembler"
114
- USER_INPUT = "user_input"
115
-
116
-
117
- class ErrorItem(BaseModel):
118
- component_type: DoclingComponentType
119
- module_name: str
120
- error_message: str
121
-
122
-
123
- class Cell(BaseModel):
124
- id: int
125
- text: str
126
- bbox: BoundingBox
127
-
128
-
129
- class OcrCell(Cell):
130
- confidence: float
131
-
132
-
133
- class Cluster(BaseModel):
134
- id: int
135
- label: DocItemLabel
136
- bbox: BoundingBox
137
- confidence: float = 1.0
138
- cells: List[Cell] = []
139
- children: List["Cluster"] = [] # Add child cluster support
140
-
141
-
142
- class BasePageElement(BaseModel):
143
- label: DocItemLabel
144
- id: int
145
- page_no: int
146
- cluster: Cluster
147
- text: Optional[str] = None
148
-
149
-
150
- class LayoutPrediction(BaseModel):
151
- clusters: List[Cluster] = []
152
-
153
-
154
- class ContainerElement(
155
- BasePageElement
156
- ): # Used for Form and Key-Value-Regions, only for typing.
157
- pass
158
-
159
-
160
- class Table(BasePageElement):
161
- otsl_seq: List[str]
162
- num_rows: int = 0
163
- num_cols: int = 0
164
- table_cells: List[TableCell]
165
-
166
-
167
- class TableStructurePrediction(BaseModel):
168
- table_map: Dict[int, Table] = {}
169
-
170
-
171
- class TextElement(BasePageElement):
172
- text: str
173
-
174
-
175
- class FigureElement(BasePageElement):
176
- annotations: List[PictureDataType] = []
177
- provenance: Optional[str] = None
178
- predicted_class: Optional[str] = None
179
- confidence: Optional[float] = None
180
-
181
-
182
- class FigureClassificationPrediction(BaseModel):
183
- figure_count: int = 0
184
- figure_map: Dict[int, FigureElement] = {}
185
-
186
-
187
- class EquationPrediction(BaseModel):
188
- equation_count: int = 0
189
- equation_map: Dict[int, TextElement] = {}
190
-
191
-
192
- class PagePredictions(BaseModel):
193
- layout: Optional[LayoutPrediction] = None
194
- tablestructure: Optional[TableStructurePrediction] = None
195
- figures_classification: Optional[FigureClassificationPrediction] = None
196
- equations_prediction: Optional[EquationPrediction] = None
197
-
198
-
199
- PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
200
-
201
-
202
- class AssembledUnit(BaseModel):
203
- elements: List[PageElement] = []
204
- body: List[PageElement] = []
205
- headers: List[PageElement] = []
206
-
207
-
208
- class ItemAndImageEnrichmentElement(BaseModel):
209
- model_config = ConfigDict(arbitrary_types_allowed=True)
210
-
211
- item: NodeItem
212
- image: Image
213
-
214
-
215
- class Page(BaseModel):
216
- model_config = ConfigDict(arbitrary_types_allowed=True)
217
-
218
- page_no: int
219
- # page_hash: Optional[str] = None
220
- size: Optional[Size] = None
221
- cells: List[Cell] = []
222
- predictions: PagePredictions = PagePredictions()
223
- assembled: Optional[AssembledUnit] = None
224
-
225
- _backend: Optional["PdfPageBackend"] = (
226
- None # Internal PDF backend. By default it is cleared during assembling.
227
- )
228
- _default_image_scale: float = 1.0 # Default image scale for external usage.
229
- _image_cache: Dict[float, Image] = (
230
- {}
231
- ) # Cache of images in different scales. By default it is cleared during assembling.
232
-
233
- def get_image(
234
- self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
235
- ) -> Optional[Image]:
236
- if self._backend is None:
237
- return self._image_cache.get(scale, None)
238
-
239
- if not scale in self._image_cache:
240
- if cropbox is None:
241
- self._image_cache[scale] = self._backend.get_page_image(scale=scale)
242
- else:
243
- return self._backend.get_page_image(scale=scale, cropbox=cropbox)
244
-
245
- if cropbox is None:
246
- return self._image_cache[scale]
247
- else:
248
- page_im = self._image_cache[scale]
249
- assert self.size is not None
250
- return page_im.crop(
251
- cropbox.to_top_left_origin(page_height=self.size.height)
252
- .scaled(scale=scale)
253
- .as_tuple()
254
- )
255
-
256
- @property
257
- def image(self) -> Optional[Image]:
258
- return self.get_image(scale=self._default_image_scale)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/document.py DELETED
@@ -1,394 +0,0 @@
1
- import logging
2
- import re
3
- from enum import Enum
4
- from io import BytesIO
5
- from pathlib import Path, PurePath
6
- from typing import (
7
- TYPE_CHECKING,
8
- Dict,
9
- Iterable,
10
- List,
11
- Literal,
12
- Optional,
13
- Set,
14
- Type,
15
- Union,
16
- )
17
-
18
- import filetype
19
- from docling_core.types.doc import (
20
- DocItem,
21
- DocItemLabel,
22
- DoclingDocument,
23
- PictureItem,
24
- SectionHeaderItem,
25
- TableItem,
26
- TextItem,
27
- )
28
- from docling_core.types.doc.document import ListItem
29
- from docling_core.types.legacy_doc.base import (
30
- BaseText,
31
- Figure,
32
- GlmTableCell,
33
- PageDimensions,
34
- PageReference,
35
- Prov,
36
- Ref,
37
- )
38
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
39
- from docling_core.types.legacy_doc.base import TableCell
40
- from docling_core.types.legacy_doc.document import (
41
- CCSDocumentDescription as DsDocumentDescription,
42
- )
43
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
44
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
45
- from docling_core.utils.file import resolve_source_to_stream
46
- from docling_core.utils.legacy import docling_document_to_legacy
47
- from pydantic import BaseModel
48
- from typing_extensions import deprecated
49
-
50
- from docling.backend.abstract_backend import (
51
- AbstractDocumentBackend,
52
- PaginatedDocumentBackend,
53
- )
54
- from docling.datamodel.base_models import (
55
- AssembledUnit,
56
- ConversionStatus,
57
- DocumentStream,
58
- ErrorItem,
59
- FormatToExtensions,
60
- FormatToMimeType,
61
- InputFormat,
62
- MimeTypeToFormat,
63
- Page,
64
- )
65
- from docling.datamodel.settings import DocumentLimits
66
- from docling.utils.profiling import ProfilingItem
67
- from docling.utils.utils import create_file_hash, create_hash
68
-
69
- if TYPE_CHECKING:
70
- from docling.document_converter import FormatOption
71
-
72
- _log = logging.getLogger(__name__)
73
-
74
- layout_label_to_ds_type = {
75
- DocItemLabel.TITLE: "title",
76
- DocItemLabel.DOCUMENT_INDEX: "table",
77
- DocItemLabel.SECTION_HEADER: "subtitle-level-1",
78
- DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
79
- DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
80
- DocItemLabel.CAPTION: "caption",
81
- DocItemLabel.PAGE_HEADER: "page-header",
82
- DocItemLabel.PAGE_FOOTER: "page-footer",
83
- DocItemLabel.FOOTNOTE: "footnote",
84
- DocItemLabel.TABLE: "table",
85
- DocItemLabel.FORMULA: "equation",
86
- DocItemLabel.LIST_ITEM: "paragraph",
87
- DocItemLabel.CODE: "paragraph",
88
- DocItemLabel.PICTURE: "figure",
89
- DocItemLabel.TEXT: "paragraph",
90
- DocItemLabel.PARAGRAPH: "paragraph",
91
- DocItemLabel.FORM: DocItemLabel.FORM.value,
92
- DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
93
- }
94
-
95
- _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
96
-
97
-
98
- class InputDocument(BaseModel):
99
- file: PurePath
100
- document_hash: str # = None
101
- valid: bool = True
102
- limits: DocumentLimits = DocumentLimits()
103
- format: InputFormat # = None
104
-
105
- filesize: Optional[int] = None
106
- page_count: int = 0
107
-
108
- _backend: AbstractDocumentBackend # Internal PDF backend used
109
-
110
- def __init__(
111
- self,
112
- path_or_stream: Union[BytesIO, Path],
113
- format: InputFormat,
114
- backend: Type[AbstractDocumentBackend],
115
- filename: Optional[str] = None,
116
- limits: Optional[DocumentLimits] = None,
117
- ):
118
- super().__init__(
119
- file="", document_hash="", format=InputFormat.PDF
120
- ) # initialize with dummy values
121
-
122
- self.limits = limits or DocumentLimits()
123
- self.format = format
124
-
125
- try:
126
- if isinstance(path_or_stream, Path):
127
- self.file = path_or_stream
128
- self.filesize = path_or_stream.stat().st_size
129
- if self.filesize > self.limits.max_file_size:
130
- self.valid = False
131
- else:
132
- self.document_hash = create_file_hash(path_or_stream)
133
- self._init_doc(backend, path_or_stream)
134
-
135
- elif isinstance(path_or_stream, BytesIO):
136
- assert (
137
- filename is not None
138
- ), "Can't construct InputDocument from stream without providing filename arg."
139
- self.file = PurePath(filename)
140
- self.filesize = path_or_stream.getbuffer().nbytes
141
-
142
- if self.filesize > self.limits.max_file_size:
143
- self.valid = False
144
- else:
145
- self.document_hash = create_file_hash(path_or_stream)
146
- self._init_doc(backend, path_or_stream)
147
- else:
148
- raise RuntimeError(
149
- f"Unexpected type path_or_stream: {type(path_or_stream)}"
150
- )
151
-
152
- # For paginated backends, check if the maximum page count is exceeded.
153
- if self.valid and self._backend.is_valid():
154
- if self._backend.supports_pagination() and isinstance(
155
- self._backend, PaginatedDocumentBackend
156
- ):
157
- self.page_count = self._backend.page_count()
158
- if not self.page_count <= self.limits.max_num_pages:
159
- self.valid = False
160
- elif self.page_count < self.limits.page_range[0]:
161
- self.valid = False
162
-
163
- except (FileNotFoundError, OSError) as e:
164
- self.valid = False
165
- _log.exception(
166
- f"File {self.file.name} not found or cannot be opened.", exc_info=e
167
- )
168
- # raise
169
- except RuntimeError as e:
170
- self.valid = False
171
- _log.exception(
172
- f"An unexpected error occurred while opening the document {self.file.name}",
173
- exc_info=e,
174
- )
175
- # raise
176
-
177
- def _init_doc(
178
- self,
179
- backend: Type[AbstractDocumentBackend],
180
- path_or_stream: Union[BytesIO, Path],
181
- ) -> None:
182
- self._backend = backend(self, path_or_stream=path_or_stream)
183
- if not self._backend.is_valid():
184
- self.valid = False
185
-
186
-
187
- class DocumentFormat(str, Enum):
188
- V2 = "v2"
189
- V1 = "v1"
190
-
191
-
192
- class ConversionResult(BaseModel):
193
- input: InputDocument
194
-
195
- status: ConversionStatus = ConversionStatus.PENDING # failure, success
196
- errors: List[ErrorItem] = [] # structure to keep errors
197
-
198
- pages: List[Page] = []
199
- assembled: AssembledUnit = AssembledUnit()
200
- timings: Dict[str, ProfilingItem] = {}
201
-
202
- document: DoclingDocument = _EMPTY_DOCLING_DOC
203
-
204
- @property
205
- @deprecated("Use document instead.")
206
- def legacy_document(self):
207
- return docling_document_to_legacy(self.document)
208
-
209
-
210
- class _DummyBackend(AbstractDocumentBackend):
211
- def __init__(self, *args, **kwargs):
212
- super().__init__(*args, **kwargs)
213
-
214
- def is_valid(self) -> bool:
215
- return False
216
-
217
- @classmethod
218
- def supported_formats(cls) -> Set[InputFormat]:
219
- return set()
220
-
221
- @classmethod
222
- def supports_pagination(cls) -> bool:
223
- return False
224
-
225
- def unload(self):
226
- return super().unload()
227
-
228
-
229
- class _DocumentConversionInput(BaseModel):
230
-
231
- path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
232
- headers: Optional[Dict[str, str]] = None
233
- limits: Optional[DocumentLimits] = DocumentLimits()
234
-
235
- def docs(
236
- self, format_options: Dict[InputFormat, "FormatOption"]
237
- ) -> Iterable[InputDocument]:
238
- for item in self.path_or_stream_iterator:
239
- obj = (
240
- resolve_source_to_stream(item, self.headers)
241
- if isinstance(item, str)
242
- else item
243
- )
244
- format = self._guess_format(obj)
245
- backend: Type[AbstractDocumentBackend]
246
- if format not in format_options.keys():
247
- _log.error(
248
- f"Input document {obj.name} does not match any allowed format."
249
- )
250
- backend = _DummyBackend
251
- else:
252
- backend = format_options[format].backend
253
-
254
- if isinstance(obj, Path):
255
- yield InputDocument(
256
- path_or_stream=obj,
257
- format=format, # type: ignore[arg-type]
258
- filename=obj.name,
259
- limits=self.limits,
260
- backend=backend,
261
- )
262
- elif isinstance(obj, DocumentStream):
263
- yield InputDocument(
264
- path_or_stream=obj.stream,
265
- format=format, # type: ignore[arg-type]
266
- filename=obj.name,
267
- limits=self.limits,
268
- backend=backend,
269
- )
270
- else:
271
- raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
272
-
273
- def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
274
- content = b"" # empty binary blob
275
- formats: list[InputFormat] = []
276
-
277
- if isinstance(obj, Path):
278
- mime = filetype.guess_mime(str(obj))
279
- if mime is None:
280
- ext = obj.suffix[1:]
281
- mime = _DocumentConversionInput._mime_from_extension(ext)
282
- if mime is None: # must guess from
283
- with obj.open("rb") as f:
284
- content = f.read(1024) # Read first 1KB
285
-
286
- elif isinstance(obj, DocumentStream):
287
- content = obj.stream.read(8192)
288
- obj.stream.seek(0)
289
- mime = filetype.guess_mime(content)
290
- if mime is None:
291
- ext = (
292
- obj.name.rsplit(".", 1)[-1]
293
- if ("." in obj.name and not obj.name.startswith("."))
294
- else ""
295
- )
296
- mime = _DocumentConversionInput._mime_from_extension(ext)
297
-
298
- mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
299
- mime = mime or "text/plain"
300
- formats = MimeTypeToFormat.get(mime, [])
301
- if formats:
302
- if len(formats) == 1 and mime not in ("text/plain"):
303
- return formats[0]
304
- else: # ambiguity in formats
305
- return _DocumentConversionInput._guess_from_content(
306
- content, mime, formats
307
- )
308
- else:
309
- return None
310
-
311
- @staticmethod
312
- def _guess_from_content(
313
- content: bytes, mime: str, formats: list[InputFormat]
314
- ) -> Optional[InputFormat]:
315
- """Guess the input format of a document by checking part of its content."""
316
- input_format: Optional[InputFormat] = None
317
- content_str = content.decode("utf-8")
318
-
319
- if mime == "application/xml":
320
- match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
321
- if match_doctype:
322
- xml_doctype = match_doctype.group()
323
- if InputFormat.XML_USPTO in formats and any(
324
- item in xml_doctype
325
- for item in (
326
- "us-patent-application-v4",
327
- "us-patent-grant-v4",
328
- "us-grant-025",
329
- "patent-application-publication",
330
- )
331
- ):
332
- input_format = InputFormat.XML_USPTO
333
-
334
- if (
335
- InputFormat.XML_PUBMED in formats
336
- and "/NLM//DTD JATS" in xml_doctype
337
- ):
338
- input_format = InputFormat.XML_PUBMED
339
-
340
- elif mime == "text/plain":
341
- if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
342
- input_format = InputFormat.XML_USPTO
343
-
344
- return input_format
345
-
346
- @staticmethod
347
- def _mime_from_extension(ext):
348
- mime = None
349
- if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
350
- mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
351
- elif ext in FormatToExtensions[InputFormat.HTML]:
352
- mime = FormatToMimeType[InputFormat.HTML][0]
353
- elif ext in FormatToExtensions[InputFormat.MD]:
354
- mime = FormatToMimeType[InputFormat.MD][0]
355
- elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
356
- mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
357
- elif ext in FormatToExtensions[InputFormat.PDF]:
358
- mime = FormatToMimeType[InputFormat.PDF][0]
359
- return mime
360
-
361
- @staticmethod
362
- def _detect_html_xhtml(
363
- content: bytes,
364
- ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
365
- """Guess the mime type of an XHTML, HTML, or XML file from its content.
366
-
367
- Args:
368
- content: A short piece of a document from its beginning.
369
-
370
- Returns:
371
- The mime type of an XHTML, HTML, or XML file, or None if the content does
372
- not match any of these formats.
373
- """
374
- content_str = content.decode("ascii", errors="ignore").lower()
375
- # Remove XML comments
376
- content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
377
- content_str = content_str.lstrip()
378
-
379
- if re.match(r"<\?xml", content_str):
380
- if "xhtml" in content_str[:1000]:
381
- return "application/xhtml+xml"
382
- else:
383
- return "application/xml"
384
-
385
- if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
386
- return "text/html"
387
-
388
- p = re.compile(
389
- r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
390
- )
391
- if p.search(content_str):
392
- return "application/xml"
393
-
394
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/pipeline_options.py DELETED
@@ -1,296 +0,0 @@
1
- import logging
2
- import os
3
- from enum import Enum
4
- from pathlib import Path
5
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
6
-
7
- from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
8
- from pydantic_settings import BaseSettings, SettingsConfigDict
9
-
10
- _log = logging.getLogger(__name__)
11
-
12
-
13
- class AcceleratorDevice(str, Enum):
14
- """Devices to run model inference"""
15
-
16
- AUTO = "auto"
17
- CPU = "cpu"
18
- CUDA = "cuda"
19
- MPS = "mps"
20
-
21
-
22
- class AcceleratorOptions(BaseSettings):
23
- model_config = SettingsConfigDict(
24
- env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
25
- )
26
-
27
- num_threads: int = 4
28
- device: AcceleratorDevice = AcceleratorDevice.AUTO
29
-
30
- @model_validator(mode="before")
31
- @classmethod
32
- def check_alternative_envvars(cls, data: Any) -> Any:
33
- r"""
34
- Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
35
- The alternative envvar is used only if it is valid and the regular envvar is not set.
36
-
37
- Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
38
- the same functionality. In case the alias envvar is set and the user tries to override the
39
- parameter in settings initialization, Pydantic treats the parameter provided in __init__()
40
- as an extra input instead of simply overwriting the evvar value for that parameter.
41
- """
42
- if isinstance(data, dict):
43
- input_num_threads = data.get("num_threads")
44
-
45
- # Check if to set the num_threads from the alternative envvar
46
- if input_num_threads is None:
47
- docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
48
- omp_num_threads = os.getenv("OMP_NUM_THREADS")
49
- if docling_num_threads is None and omp_num_threads is not None:
50
- try:
51
- data["num_threads"] = int(omp_num_threads)
52
- except ValueError:
53
- _log.error(
54
- "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
55
- omp_num_threads,
56
- )
57
- return data
58
-
59
-
60
- class TableFormerMode(str, Enum):
61
- """Modes for the TableFormer model."""
62
-
63
- FAST = "fast"
64
- ACCURATE = "accurate"
65
-
66
-
67
- class TableStructureOptions(BaseModel):
68
- """Options for the table structure."""
69
-
70
- do_cell_matching: bool = (
71
- True
72
- # True: Matches predictions back to PDF cells. Can break table output if PDF cells
73
- # are merged across table columns.
74
- # False: Let table structure model define the text cells, ignore PDF cells.
75
- )
76
- mode: TableFormerMode = TableFormerMode.FAST
77
-
78
-
79
- class OcrOptions(BaseModel):
80
- """OCR options."""
81
-
82
- kind: str
83
- lang: List[str]
84
- force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
85
- bitmap_area_threshold: float = (
86
- 0.05 # percentage of the area for a bitmap to processed with OCR
87
- )
88
-
89
-
90
- class RapidOcrOptions(OcrOptions):
91
- """Options for the RapidOCR engine."""
92
-
93
- kind: Literal["rapidocr"] = "rapidocr"
94
-
95
- # English and chinese are the most commly used models and have been tested with RapidOCR.
96
- lang: List[str] = [
97
- "english",
98
- "chinese",
99
- ] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
100
- # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
101
-
102
- # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
103
- text_score: float = 0.5 # same default as rapidocr
104
-
105
- use_det: Optional[bool] = None # same default as rapidocr
106
- use_cls: Optional[bool] = None # same default as rapidocr
107
- use_rec: Optional[bool] = None # same default as rapidocr
108
-
109
- # class Device(Enum):
110
- # CPU = "CPU"
111
- # CUDA = "CUDA"
112
- # DIRECTML = "DIRECTML"
113
- # AUTO = "AUTO"
114
-
115
- # device: Device = Device.AUTO # Default value is AUTO
116
-
117
- print_verbose: bool = False # same default as rapidocr
118
-
119
- det_model_path: Optional[str] = None # same default as rapidocr
120
- cls_model_path: Optional[str] = None # same default as rapidocr
121
- rec_model_path: Optional[str] = None # same default as rapidocr
122
- rec_keys_path: Optional[str] = None # same default as rapidocr
123
-
124
- model_config = ConfigDict(
125
- extra="forbid",
126
- )
127
-
128
-
129
- class EasyOcrOptions(OcrOptions):
130
- """Options for the EasyOCR engine."""
131
-
132
- kind: Literal["easyocr"] = "easyocr"
133
- lang: List[str] = ["fr", "de", "es", "en"]
134
-
135
- use_gpu: Optional[bool] = None
136
-
137
- confidence_threshold: float = 0.5
138
-
139
- model_storage_directory: Optional[str] = None
140
- recog_network: Optional[str] = "standard"
141
- download_enabled: bool = True
142
-
143
- model_config = ConfigDict(
144
- extra="forbid",
145
- protected_namespaces=(),
146
- )
147
-
148
-
149
- class TesseractCliOcrOptions(OcrOptions):
150
- """Options for the TesseractCli engine."""
151
-
152
- kind: Literal["tesseract"] = "tesseract"
153
- lang: List[str] = ["fra", "deu", "spa", "eng"]
154
- tesseract_cmd: str = "tesseract"
155
- path: Optional[str] = None
156
-
157
- model_config = ConfigDict(
158
- extra="forbid",
159
- )
160
-
161
-
162
- class TesseractOcrOptions(OcrOptions):
163
- """Options for the Tesseract engine."""
164
-
165
- kind: Literal["tesserocr"] = "tesserocr"
166
- lang: List[str] = ["fra", "deu", "spa", "eng"]
167
- path: Optional[str] = None
168
-
169
- model_config = ConfigDict(
170
- extra="forbid",
171
- )
172
-
173
-
174
- class OcrMacOptions(OcrOptions):
175
- """Options for the Mac OCR engine."""
176
-
177
- kind: Literal["ocrmac"] = "ocrmac"
178
- lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
179
- recognition: str = "accurate"
180
- framework: str = "vision"
181
-
182
- model_config = ConfigDict(
183
- extra="forbid",
184
- )
185
-
186
-
187
- class PictureDescriptionBaseOptions(BaseModel):
188
- kind: str
189
- batch_size: int = 8
190
- scale: float = 2
191
-
192
- bitmap_area_threshold: float = (
193
- 0.2 # percentage of the area for a bitmap to processed with the models
194
- )
195
-
196
-
197
- class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
198
- kind: Literal["api"] = "api"
199
-
200
- url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
201
- headers: Dict[str, str] = {}
202
- params: Dict[str, Any] = {}
203
- timeout: float = 20
204
-
205
- prompt: str = "Describe this image in a few sentences."
206
- provenance: str = ""
207
-
208
-
209
- class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
210
- kind: Literal["vlm"] = "vlm"
211
-
212
- repo_id: str
213
- prompt: str = "Describe this image in a few sentences."
214
- # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
215
- generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
216
-
217
- @property
218
- def repo_cache_folder(self) -> str:
219
- return self.repo_id.replace("/", "--")
220
-
221
-
222
- smolvlm_picture_description = PictureDescriptionVlmOptions(
223
- repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
224
- )
225
- # phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
226
- granite_picture_description = PictureDescriptionVlmOptions(
227
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
228
- prompt="What is shown in this image?",
229
- )
230
-
231
-
232
- # Define an enum for the backend options
233
- class PdfBackend(str, Enum):
234
- """Enum of valid PDF backends."""
235
-
236
- PYPDFIUM2 = "pypdfium2"
237
- DLPARSE_V1 = "dlparse_v1"
238
- DLPARSE_V2 = "dlparse_v2"
239
-
240
-
241
- # Define an enum for the ocr engines
242
- class OcrEngine(str, Enum):
243
- """Enum of valid OCR engines."""
244
-
245
- EASYOCR = "easyocr"
246
- TESSERACT_CLI = "tesseract_cli"
247
- TESSERACT = "tesseract"
248
- OCRMAC = "ocrmac"
249
- RAPIDOCR = "rapidocr"
250
-
251
-
252
- class PipelineOptions(BaseModel):
253
- """Base pipeline options."""
254
-
255
- create_legacy_output: bool = (
256
- True # This default will be set to False on a future version of docling
257
- )
258
- document_timeout: Optional[float] = None
259
- accelerator_options: AcceleratorOptions = AcceleratorOptions()
260
-
261
-
262
- class PdfPipelineOptions(PipelineOptions):
263
- """Options for the PDF pipeline."""
264
-
265
- artifacts_path: Optional[Union[Path, str]] = None
266
- do_table_structure: bool = True # True: perform table structure extraction
267
- do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
268
- do_code_enrichment: bool = False # True: perform code OCR
269
- do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
270
- do_picture_classification: bool = False # True: classify pictures in documents
271
- do_picture_description: bool = False # True: run describe pictures in documents
272
-
273
- table_structure_options: TableStructureOptions = TableStructureOptions()
274
- ocr_options: Union[
275
- EasyOcrOptions,
276
- TesseractCliOcrOptions,
277
- TesseractOcrOptions,
278
- OcrMacOptions,
279
- RapidOcrOptions,
280
- ] = Field(EasyOcrOptions(), discriminator="kind")
281
- picture_description_options: Annotated[
282
- Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
283
- Field(discriminator="kind"),
284
- ] = smolvlm_picture_description
285
-
286
- images_scale: float = 1.0
287
- generate_page_images: bool = False
288
- generate_picture_images: bool = False
289
- generate_table_images: bool = Field(
290
- default=False,
291
- deprecated=(
292
- "Field `generate_table_images` is deprecated. "
293
- "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
294
- "before conversion and then use the `TableItem.get_image` function."
295
- ),
296
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/datamodel/settings.py DELETED
@@ -1,67 +0,0 @@
1
- import sys
2
- from pathlib import Path
3
- from typing import Annotated, Tuple
4
-
5
- from pydantic import BaseModel, PlainValidator
6
- from pydantic_settings import BaseSettings, SettingsConfigDict
7
-
8
-
9
- def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
10
- if v[0] < 1 or v[1] < v[0]:
11
- raise ValueError(
12
- "Invalid page range: start must be ≥ 1 and end must be ≥ start."
13
- )
14
- return v
15
-
16
-
17
- PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
18
-
19
- DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
20
-
21
-
22
- class DocumentLimits(BaseModel):
23
- max_num_pages: int = sys.maxsize
24
- max_file_size: int = sys.maxsize
25
- page_range: PageRange = DEFAULT_PAGE_RANGE
26
-
27
-
28
- class BatchConcurrencySettings(BaseModel):
29
- doc_batch_size: int = 2
30
- doc_batch_concurrency: int = 2
31
- page_batch_size: int = 4
32
- page_batch_concurrency: int = 2
33
- elements_batch_size: int = 16
34
-
35
- # doc_batch_size: int = 1
36
- # doc_batch_concurrency: int = 1
37
- # page_batch_size: int = 1
38
- # page_batch_concurrency: int = 1
39
-
40
- # model_concurrency: int = 2
41
-
42
- # To force models into single core: export OMP_NUM_THREADS=1
43
-
44
-
45
- class DebugSettings(BaseModel):
46
- visualize_cells: bool = False
47
- visualize_ocr: bool = False
48
- visualize_layout: bool = False
49
- visualize_raw_layout: bool = False
50
- visualize_tables: bool = False
51
-
52
- profile_pipeline_timings: bool = False
53
-
54
- # Path used to output debug information.
55
- debug_output_path: str = str(Path.cwd() / "debug")
56
-
57
-
58
- class AppSettings(BaseSettings):
59
- model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
60
-
61
- perf: BatchConcurrencySettings
62
- debug: DebugSettings
63
-
64
- cache_dir: Path = Path.home() / ".cache" / "docling"
65
-
66
-
67
- settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/document_converter.py DELETED
@@ -1,348 +0,0 @@
1
- import logging
2
- import math
3
- import sys
4
- import time
5
- from functools import partial
6
- from pathlib import Path
7
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
8
-
9
- from pydantic import BaseModel, ConfigDict, model_validator, validate_call
10
-
11
- from docling.backend.abstract_backend import AbstractDocumentBackend
12
- from docling.backend.asciidoc_backend import AsciiDocBackend
13
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
14
- from docling.backend.html_backend import HTMLDocumentBackend
15
- from docling.backend.json.docling_json_backend import DoclingJSONBackend
16
- from docling.backend.md_backend import MarkdownDocumentBackend
17
- from docling.backend.msexcel_backend import MsExcelDocumentBackend
18
- from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
19
- from docling.backend.msword_backend import MsWordDocumentBackend
20
- from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
21
- from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
22
- from docling.datamodel.base_models import (
23
- ConversionStatus,
24
- DoclingComponentType,
25
- DocumentStream,
26
- ErrorItem,
27
- InputFormat,
28
- )
29
- from docling.datamodel.document import (
30
- ConversionResult,
31
- InputDocument,
32
- _DocumentConversionInput,
33
- )
34
- from docling.datamodel.pipeline_options import PipelineOptions
35
- from docling.datamodel.settings import (
36
- DEFAULT_PAGE_RANGE,
37
- DocumentLimits,
38
- PageRange,
39
- settings,
40
- )
41
- from docling.exceptions import ConversionError
42
- from docling.pipeline.base_pipeline import BasePipeline
43
- from docling.pipeline.simple_pipeline import SimplePipeline
44
- from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
45
- from docling.utils.utils import chunkify
46
-
47
- _log = logging.getLogger(__name__)
48
-
49
-
50
- class FormatOption(BaseModel):
51
- pipeline_cls: Type[BasePipeline]
52
- pipeline_options: Optional[PipelineOptions] = None
53
- backend: Type[AbstractDocumentBackend]
54
-
55
- model_config = ConfigDict(arbitrary_types_allowed=True)
56
-
57
- @model_validator(mode="after")
58
- def set_optional_field_default(self) -> "FormatOption":
59
- if self.pipeline_options is None:
60
- self.pipeline_options = self.pipeline_cls.get_default_options()
61
- return self
62
-
63
-
64
- class ExcelFormatOption(FormatOption):
65
- pipeline_cls: Type = SimplePipeline
66
- backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
67
-
68
-
69
- class WordFormatOption(FormatOption):
70
- pipeline_cls: Type = SimplePipeline
71
- backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
72
-
73
-
74
- class PowerpointFormatOption(FormatOption):
75
- pipeline_cls: Type = SimplePipeline
76
- backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
77
-
78
-
79
- class MarkdownFormatOption(FormatOption):
80
- pipeline_cls: Type = SimplePipeline
81
- backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
82
-
83
-
84
- class AsciiDocFormatOption(FormatOption):
85
- pipeline_cls: Type = SimplePipeline
86
- backend: Type[AbstractDocumentBackend] = AsciiDocBackend
87
-
88
-
89
- class HTMLFormatOption(FormatOption):
90
- pipeline_cls: Type = SimplePipeline
91
- backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
92
-
93
-
94
- class PatentUsptoFormatOption(FormatOption):
95
- pipeline_cls: Type = SimplePipeline
96
- backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
97
-
98
-
99
- class XMLPubMedFormatOption(FormatOption):
100
- pipeline_cls: Type = SimplePipeline
101
- backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
102
-
103
-
104
- class ImageFormatOption(FormatOption):
105
- pipeline_cls: Type = StandardPdfPipeline
106
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
107
-
108
-
109
- class PdfFormatOption(FormatOption):
110
- pipeline_cls: Type = StandardPdfPipeline
111
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
112
-
113
-
114
- def _get_default_option(format: InputFormat) -> FormatOption:
115
- format_to_default_options = {
116
- InputFormat.XLSX: FormatOption(
117
- pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
118
- ),
119
- InputFormat.DOCX: FormatOption(
120
- pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
121
- ),
122
- InputFormat.PPTX: FormatOption(
123
- pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
124
- ),
125
- InputFormat.MD: FormatOption(
126
- pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
127
- ),
128
- InputFormat.ASCIIDOC: FormatOption(
129
- pipeline_cls=SimplePipeline, backend=AsciiDocBackend
130
- ),
131
- InputFormat.HTML: FormatOption(
132
- pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
133
- ),
134
- InputFormat.XML_USPTO: FormatOption(
135
- pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
136
- ),
137
- InputFormat.XML_PUBMED: FormatOption(
138
- pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
139
- ),
140
- InputFormat.IMAGE: FormatOption(
141
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
142
- ),
143
- InputFormat.PDF: FormatOption(
144
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
145
- ),
146
- InputFormat.JSON_DOCLING: FormatOption(
147
- pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
148
- ),
149
- }
150
- if (options := format_to_default_options.get(format)) is not None:
151
- return options
152
- else:
153
- raise RuntimeError(f"No default options configured for {format}")
154
-
155
-
156
- class DocumentConverter:
157
- _default_download_filename = "file"
158
-
159
- def __init__(
160
- self,
161
- allowed_formats: Optional[List[InputFormat]] = None,
162
- format_options: Optional[Dict[InputFormat, FormatOption]] = None,
163
- ):
164
- self.allowed_formats = (
165
- allowed_formats if allowed_formats is not None else [e for e in InputFormat]
166
- )
167
- self.format_to_options = {
168
- format: (
169
- _get_default_option(format=format)
170
- if (custom_option := (format_options or {}).get(format)) is None
171
- else custom_option
172
- )
173
- for format in self.allowed_formats
174
- }
175
- self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
176
-
177
- def initialize_pipeline(self, format: InputFormat):
178
- """Initialize the conversion pipeline for the selected format."""
179
- pipeline = self._get_pipeline(doc_format=format)
180
- if pipeline is None:
181
- raise ConversionError(
182
- f"No pipeline could be initialized for format {format}"
183
- )
184
-
185
- @validate_call(config=ConfigDict(strict=True))
186
- def convert(
187
- self,
188
- source: Union[Path, str, DocumentStream], # TODO review naming
189
- headers: Optional[Dict[str, str]] = None,
190
- raises_on_error: bool = True,
191
- max_num_pages: int = sys.maxsize,
192
- max_file_size: int = sys.maxsize,
193
- page_range: PageRange = DEFAULT_PAGE_RANGE,
194
- ) -> ConversionResult:
195
- all_res = self.convert_all(
196
- source=[source],
197
- raises_on_error=raises_on_error,
198
- max_num_pages=max_num_pages,
199
- max_file_size=max_file_size,
200
- headers=headers,
201
- page_range=page_range,
202
- )
203
- return next(all_res)
204
-
205
- @validate_call(config=ConfigDict(strict=True))
206
- def convert_all(
207
- self,
208
- source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
209
- headers: Optional[Dict[str, str]] = None,
210
- raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
211
- max_num_pages: int = sys.maxsize,
212
- max_file_size: int = sys.maxsize,
213
- page_range: PageRange = DEFAULT_PAGE_RANGE,
214
- ) -> Iterator[ConversionResult]:
215
- limits = DocumentLimits(
216
- max_num_pages=max_num_pages,
217
- max_file_size=max_file_size,
218
- page_range=page_range,
219
- )
220
- conv_input = _DocumentConversionInput(
221
- path_or_stream_iterator=source, limits=limits, headers=headers
222
- )
223
- conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
224
-
225
- had_result = False
226
- for conv_res in conv_res_iter:
227
- had_result = True
228
- if raises_on_error and conv_res.status not in {
229
- ConversionStatus.SUCCESS,
230
- ConversionStatus.PARTIAL_SUCCESS,
231
- }:
232
- raise ConversionError(
233
- f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
234
- )
235
- else:
236
- yield conv_res
237
-
238
- if not had_result and raises_on_error:
239
- raise ConversionError(
240
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
241
- )
242
-
243
- def _convert(
244
- self, conv_input: _DocumentConversionInput, raises_on_error: bool
245
- ) -> Iterator[ConversionResult]:
246
- start_time = time.monotonic()
247
-
248
- for input_batch in chunkify(
249
- conv_input.docs(self.format_to_options),
250
- settings.perf.doc_batch_size, # pass format_options
251
- ):
252
- _log.info(f"Going to convert document batch...")
253
-
254
- # parallel processing only within input_batch
255
- # with ThreadPoolExecutor(
256
- # max_workers=settings.perf.doc_batch_concurrency
257
- # ) as pool:
258
- # yield from pool.map(self.process_document, input_batch)
259
- # Note: PDF backends are not thread-safe, thread pool usage was disabled.
260
-
261
- for item in map(
262
- partial(self._process_document, raises_on_error=raises_on_error),
263
- input_batch,
264
- ):
265
- elapsed = time.monotonic() - start_time
266
- start_time = time.monotonic()
267
- _log.info(
268
- f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
269
- )
270
- yield item
271
-
272
- def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
273
- fopt = self.format_to_options.get(doc_format)
274
-
275
- if fopt is None:
276
- return None
277
- else:
278
- pipeline_class = fopt.pipeline_cls
279
- pipeline_options = fopt.pipeline_options
280
-
281
- if pipeline_options is None:
282
- return None
283
- # TODO this will ignore if different options have been defined for the same pipeline class.
284
- if (
285
- pipeline_class not in self.initialized_pipelines
286
- or self.initialized_pipelines[pipeline_class].pipeline_options
287
- != pipeline_options
288
- ):
289
- self.initialized_pipelines[pipeline_class] = pipeline_class(
290
- pipeline_options=pipeline_options
291
- )
292
- return self.initialized_pipelines[pipeline_class]
293
-
294
- def _process_document(
295
- self, in_doc: InputDocument, raises_on_error: bool
296
- ) -> ConversionResult:
297
-
298
- valid = (
299
- self.allowed_formats is not None and in_doc.format in self.allowed_formats
300
- )
301
- if valid:
302
- conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
303
- else:
304
- error_message = f"File format not allowed: {in_doc.file}"
305
- if raises_on_error:
306
- raise ConversionError(error_message)
307
- else:
308
- error_item = ErrorItem(
309
- component_type=DoclingComponentType.USER_INPUT,
310
- module_name="",
311
- error_message=error_message,
312
- )
313
- conv_res = ConversionResult(
314
- input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
315
- )
316
-
317
- return conv_res
318
-
319
- def _execute_pipeline(
320
- self, in_doc: InputDocument, raises_on_error: bool
321
- ) -> ConversionResult:
322
- if in_doc.valid:
323
- pipeline = self._get_pipeline(in_doc.format)
324
- if pipeline is not None:
325
- conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
326
- else:
327
- if raises_on_error:
328
- raise ConversionError(
329
- f"No pipeline could be initialized for {in_doc.file}."
330
- )
331
- else:
332
- conv_res = ConversionResult(
333
- input=in_doc,
334
- status=ConversionStatus.FAILURE,
335
- )
336
- else:
337
- if raises_on_error:
338
- raise ConversionError(f"Input document {in_doc.file} is not valid.")
339
-
340
- else:
341
- # invalid doc or not of desired format
342
- conv_res = ConversionResult(
343
- input=in_doc,
344
- status=ConversionStatus.FAILURE,
345
- )
346
- # TODO add error log why it failed.
347
-
348
- return conv_res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/exceptions.py DELETED
@@ -1,6 +0,0 @@
1
- class BaseError(RuntimeError):
2
- pass
3
-
4
-
5
- class ConversionError(BaseError):
6
- pass
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/__init__.py DELETED
File without changes
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_model.py DELETED
@@ -1,87 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional
3
-
4
- from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
- from typing_extensions import TypeVar
6
-
7
- from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
8
- from docling.datamodel.document import ConversionResult
9
- from docling.datamodel.settings import settings
10
-
11
-
12
- class BasePageModel(ABC):
13
- @abstractmethod
14
- def __call__(
15
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
16
- ) -> Iterable[Page]:
17
- pass
18
-
19
-
20
- EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
21
-
22
-
23
- class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
24
-
25
- elements_batch_size: int = settings.perf.elements_batch_size
26
-
27
- @abstractmethod
28
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
29
- pass
30
-
31
- @abstractmethod
32
- def prepare_element(
33
- self, conv_res: ConversionResult, element: NodeItem
34
- ) -> Optional[EnrichElementT]:
35
- pass
36
-
37
- @abstractmethod
38
- def __call__(
39
- self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
40
- ) -> Iterable[NodeItem]:
41
- pass
42
-
43
-
44
- class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
45
-
46
- def prepare_element(
47
- self, conv_res: ConversionResult, element: NodeItem
48
- ) -> Optional[NodeItem]:
49
- if self.is_processable(doc=conv_res.document, element=element):
50
- return element
51
- return None
52
-
53
-
54
- class BaseItemAndImageEnrichmentModel(
55
- GenericEnrichmentModel[ItemAndImageEnrichmentElement]
56
- ):
57
-
58
- images_scale: float
59
- expansion_factor: float = 0.0
60
-
61
- def prepare_element(
62
- self, conv_res: ConversionResult, element: NodeItem
63
- ) -> Optional[ItemAndImageEnrichmentElement]:
64
- if not self.is_processable(doc=conv_res.document, element=element):
65
- return None
66
-
67
- assert isinstance(element, DocItem)
68
- element_prov = element.prov[0]
69
-
70
- bbox = element_prov.bbox
71
- width = bbox.r - bbox.l
72
- height = bbox.t - bbox.b
73
-
74
- # TODO: move to a utility in the BoundingBox class
75
- expanded_bbox = BoundingBox(
76
- l=bbox.l - width * self.expansion_factor,
77
- t=bbox.t + height * self.expansion_factor,
78
- r=bbox.r + width * self.expansion_factor,
79
- b=bbox.b - height * self.expansion_factor,
80
- coord_origin=bbox.coord_origin,
81
- )
82
-
83
- page_ix = element_prov.page_no - 1
84
- cropped_image = conv_res.pages[page_ix].get_image(
85
- scale=self.images_scale, cropbox=expanded_bbox
86
- )
87
- return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/base_ocr_model.py DELETED
@@ -1,189 +0,0 @@
1
- import copy
2
- import logging
3
- from abc import abstractmethod
4
- from pathlib import Path
5
- from typing import Iterable, List
6
-
7
- import numpy as np
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from PIL import Image, ImageDraw
10
- from rtree import index
11
- from scipy.ndimage import binary_dilation, find_objects, label
12
-
13
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
- from docling.datamodel.document import ConversionResult
15
- from docling.datamodel.pipeline_options import OcrOptions
16
- from docling.datamodel.settings import settings
17
- from docling.models.base_model import BasePageModel
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class BaseOcrModel(BasePageModel):
23
- def __init__(self, enabled: bool, options: OcrOptions):
24
- self.enabled = enabled
25
- self.options = options
26
-
27
- # Computes the optimum amount and coordinates of rectangles to OCR on a given page
28
- def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
29
- BITMAP_COVERAGE_TRESHOLD = 0.75
30
- assert page.size is not None
31
-
32
- def find_ocr_rects(size, bitmap_rects):
33
- image = Image.new(
34
- "1", (round(size.width), round(size.height))
35
- ) # '1' mode is binary
36
-
37
- # Draw all bitmap rects into a binary image
38
- draw = ImageDraw.Draw(image)
39
- for rect in bitmap_rects:
40
- x0, y0, x1, y1 = rect.as_tuple()
41
- x0, y0, x1, y1 = round(x0), round(y0), round(x1), round(y1)
42
- draw.rectangle([(x0, y0), (x1, y1)], fill=1)
43
-
44
- np_image = np.array(image)
45
-
46
- # Dilate the image by 10 pixels to merge nearby bitmap rectangles
47
- structure = np.ones(
48
- (20, 20)
49
- ) # Create a 20x20 structure element (10 pixels in all directions)
50
- np_image = binary_dilation(np_image > 0, structure=structure)
51
-
52
- # Find the connected components
53
- labeled_image, num_features = label(
54
- np_image > 0
55
- ) # Label black (0 value) regions
56
-
57
- # Find enclosing bounding boxes for each connected component.
58
- slices = find_objects(labeled_image)
59
- bounding_boxes = [
60
- BoundingBox(
61
- l=slc[1].start,
62
- t=slc[0].start,
63
- r=slc[1].stop - 1,
64
- b=slc[0].stop - 1,
65
- coord_origin=CoordOrigin.TOPLEFT,
66
- )
67
- for slc in slices
68
- ]
69
-
70
- # Compute area fraction on page covered by bitmaps
71
- area_frac = np.sum(np_image > 0) / (size.width * size.height)
72
-
73
- return (area_frac, bounding_boxes) # fraction covered # boxes
74
-
75
- if page._backend is not None:
76
- bitmap_rects = page._backend.get_bitmap_rects()
77
- else:
78
- bitmap_rects = []
79
- coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
80
-
81
- # return full-page rectangle if page is dominantly covered with bitmaps
82
- if self.options.force_full_page_ocr or coverage > max(
83
- BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
84
- ):
85
- return [
86
- BoundingBox(
87
- l=0,
88
- t=0,
89
- r=page.size.width,
90
- b=page.size.height,
91
- coord_origin=CoordOrigin.TOPLEFT,
92
- )
93
- ]
94
- # return individual rectangles if the bitmap coverage is above the threshold
95
- elif coverage > self.options.bitmap_area_threshold:
96
- return ocr_rects
97
- else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
98
- return []
99
-
100
- # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
101
- def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
102
- # Create R-tree index for programmatic cells
103
- p = index.Property()
104
- p.dimension = 2
105
- idx = index.Index(properties=p)
106
- for i, cell in enumerate(programmatic_cells):
107
- idx.insert(i, cell.bbox.as_tuple())
108
-
109
- def is_overlapping_with_existing_cells(ocr_cell):
110
- # Query the R-tree to get overlapping rectangles
111
- possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
112
-
113
- return (
114
- len(possible_matches_index) > 0
115
- ) # this is a weak criterion but it works.
116
-
117
- filtered_ocr_cells = [
118
- rect for rect in ocr_cells if not is_overlapping_with_existing_cells(rect)
119
- ]
120
- return filtered_ocr_cells
121
-
122
- def post_process_cells(self, ocr_cells, programmatic_cells):
123
- r"""
124
- Post-process the ocr and programmatic cells and return the final list of of cells
125
- """
126
- if self.options.force_full_page_ocr:
127
- # If a full page OCR is forced, use only the OCR cells
128
- cells = [
129
- Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
130
- for c_ocr in ocr_cells
131
- ]
132
- return cells
133
-
134
- ## Remove OCR cells which overlap with programmatic cells.
135
- filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
136
- programmatic_cells.extend(filtered_ocr_cells)
137
- return programmatic_cells
138
-
139
- def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
140
- image = copy.deepcopy(page.image)
141
- scale_x = image.width / page.size.width
142
- scale_y = image.height / page.size.height
143
-
144
- draw = ImageDraw.Draw(image, "RGBA")
145
-
146
- # Draw OCR rectangles as yellow filled rect
147
- for rect in ocr_rects:
148
- x0, y0, x1, y1 = rect.as_tuple()
149
- y0 *= scale_x
150
- y1 *= scale_y
151
- x0 *= scale_x
152
- x1 *= scale_x
153
-
154
- shade_color = (255, 255, 0, 40) # transparent yellow
155
- draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
156
-
157
- # Draw OCR and programmatic cells
158
- for tc in page.cells:
159
- x0, y0, x1, y1 = tc.bbox.as_tuple()
160
- y0 *= scale_x
161
- y1 *= scale_y
162
- x0 *= scale_x
163
- x1 *= scale_x
164
-
165
- if y1 <= y0:
166
- y1, y0 = y0, y1
167
-
168
- color = "gray"
169
- if isinstance(tc, OcrCell):
170
- color = "magenta"
171
- draw.rectangle([(x0, y0), (x1, y1)], outline=color)
172
-
173
- if show:
174
- image.show()
175
- else:
176
- out_path: Path = (
177
- Path(settings.debug.debug_output_path)
178
- / f"debug_{conv_res.input.file.stem}"
179
- )
180
- out_path.mkdir(parents=True, exist_ok=True)
181
-
182
- out_file = out_path / f"ocr_page_{page.page_no:05}.png"
183
- image.save(str(out_file), format="png")
184
-
185
- @abstractmethod
186
- def __call__(
187
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
188
- ) -> Iterable[Page]:
189
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/code_formula_model.py DELETED
@@ -1,251 +0,0 @@
1
- import re
2
- from pathlib import Path
3
- from typing import Iterable, List, Literal, Optional, Tuple, Union
4
-
5
- import numpy as np
6
- from docling_core.types.doc import (
7
- CodeItem,
8
- DocItemLabel,
9
- DoclingDocument,
10
- NodeItem,
11
- TextItem,
12
- )
13
- from docling_core.types.doc.labels import CodeLanguageLabel
14
- from PIL import Image
15
- from pydantic import BaseModel
16
-
17
- from docling.datamodel.base_models import ItemAndImageEnrichmentElement
18
- from docling.datamodel.pipeline_options import AcceleratorOptions
19
- from docling.models.base_model import BaseItemAndImageEnrichmentModel
20
- from docling.utils.accelerator_utils import decide_device
21
-
22
-
23
- class CodeFormulaModelOptions(BaseModel):
24
- """
25
- Configuration options for the CodeFormulaModel.
26
-
27
- Attributes
28
- ----------
29
- kind : str
30
- Type of the model. Fixed value "code_formula".
31
- do_code_enrichment : bool
32
- True if code enrichment is enabled, False otherwise.
33
- do_formula_enrichment : bool
34
- True if formula enrichment is enabled, False otherwise.
35
- """
36
-
37
- kind: Literal["code_formula"] = "code_formula"
38
- do_code_enrichment: bool = True
39
- do_formula_enrichment: bool = True
40
-
41
-
42
- class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
43
- """
44
- Model for processing and enriching documents with code and formula predictions.
45
-
46
- Attributes
47
- ----------
48
- enabled : bool
49
- True if the model is enabled, False otherwise.
50
- options : CodeFormulaModelOptions
51
- Configuration options for the CodeFormulaModel.
52
- code_formula_model : CodeFormulaPredictor
53
- The predictor model for code and formula processing.
54
-
55
- Methods
56
- -------
57
- __init__(self, enabled, artifacts_path, accelerator_options, code_formula_options)
58
- Initializes the CodeFormulaModel with the given configuration options.
59
- is_processable(self, doc, element)
60
- Determines if a given element in a document can be processed by the model.
61
- __call__(self, doc, element_batch)
62
- Processes the given batch of elements and enriches them with predictions.
63
- """
64
-
65
- _model_repo_folder = "ds4sd--CodeFormula"
66
- elements_batch_size = 5
67
- images_scale = 1.66 # = 120 dpi, aligned with training data resolution
68
- expansion_factor = 0.03
69
-
70
- def __init__(
71
- self,
72
- enabled: bool,
73
- artifacts_path: Optional[Path],
74
- options: CodeFormulaModelOptions,
75
- accelerator_options: AcceleratorOptions,
76
- ):
77
- """
78
- Initializes the CodeFormulaModel with the given configuration.
79
-
80
- Parameters
81
- ----------
82
- enabled : bool
83
- True if the model is enabled, False otherwise.
84
- artifacts_path : Path
85
- Path to the directory containing the model artifacts.
86
- options : CodeFormulaModelOptions
87
- Configuration options for the model.
88
- accelerator_options : AcceleratorOptions
89
- Options specifying the device and number of threads for acceleration.
90
- """
91
- self.enabled = enabled
92
- self.options = options
93
-
94
- if self.enabled:
95
- device = decide_device(accelerator_options.device)
96
-
97
- from docling_ibm_models.code_formula_model.code_formula_predictor import (
98
- CodeFormulaPredictor,
99
- )
100
-
101
- if artifacts_path is None:
102
- artifacts_path = self.download_models()
103
- else:
104
- artifacts_path = artifacts_path / self._model_repo_folder
105
-
106
- self.code_formula_model = CodeFormulaPredictor(
107
- artifacts_path=str(artifacts_path),
108
- device=device,
109
- num_threads=accelerator_options.num_threads,
110
- )
111
-
112
- @staticmethod
113
- def download_models(
114
- local_dir: Optional[Path] = None,
115
- force: bool = False,
116
- progress: bool = False,
117
- ) -> Path:
118
- from huggingface_hub import snapshot_download
119
- from huggingface_hub.utils import disable_progress_bars
120
-
121
- if not progress:
122
- disable_progress_bars()
123
- download_path = snapshot_download(
124
- repo_id="ds4sd/CodeFormula",
125
- force_download=force,
126
- local_dir=local_dir,
127
- revision="v1.0.1",
128
- )
129
-
130
- return Path(download_path)
131
-
132
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
133
- """
134
- Determines if a given element in a document can be processed by the model.
135
-
136
- Parameters
137
- ----------
138
- doc : DoclingDocument
139
- The document being processed.
140
- element : NodeItem
141
- The element within the document to check.
142
-
143
- Returns
144
- -------
145
- bool
146
- True if the element can be processed, False otherwise.
147
- """
148
- return self.enabled and (
149
- (isinstance(element, CodeItem) and self.options.do_code_enrichment)
150
- or (
151
- isinstance(element, TextItem)
152
- and element.label == DocItemLabel.FORMULA
153
- and self.options.do_formula_enrichment
154
- )
155
- )
156
-
157
- def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
158
- """Extracts a programming language from the beginning of a string.
159
-
160
- This function checks if the input string starts with a pattern of the form
161
- ``<_some_language_>``. If it does, it extracts the language string and returns
162
- a tuple of (remainder, language). Otherwise, it returns the original string
163
- and `None`.
164
-
165
- Args:
166
- input_string (str): The input string, which may start with ``<_language_>``.
167
-
168
- Returns:
169
- Tuple[str, Optional[str]]:
170
- A tuple where:
171
- - The first element is either:
172
- - The remainder of the string (everything after ``<_language_>``),
173
- if a match is found; or
174
- - The original string, if no match is found.
175
- - The second element is the extracted language if a match is found;
176
- otherwise, `None`.
177
- """
178
- pattern = r"^<_([^>]+)_>\s*(.*)"
179
- match = re.match(pattern, input_string, flags=re.DOTALL)
180
- if match:
181
- language = str(match.group(1)) # the captured programming language
182
- remainder = str(match.group(2)) # everything after the <_language_>
183
- return remainder, language
184
- else:
185
- return input_string, None
186
-
187
- def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
188
- """
189
- Converts a string to a corresponding `CodeLanguageLabel` enum member.
190
-
191
- If the provided string does not match any value in `CodeLanguageLabel`,
192
- it defaults to `CodeLanguageLabel.UNKNOWN`.
193
-
194
- Args:
195
- value (Optional[str]): The string representation of the code language or None.
196
-
197
- Returns:
198
- CodeLanguageLabel: The corresponding enum member if the value is valid,
199
- otherwise `CodeLanguageLabel.UNKNOWN`.
200
- """
201
- if not isinstance(value, str):
202
- return CodeLanguageLabel.UNKNOWN
203
-
204
- try:
205
- return CodeLanguageLabel(value)
206
- except ValueError:
207
- return CodeLanguageLabel.UNKNOWN
208
-
209
- def __call__(
210
- self,
211
- doc: DoclingDocument,
212
- element_batch: Iterable[ItemAndImageEnrichmentElement],
213
- ) -> Iterable[NodeItem]:
214
- """
215
- Processes the given batch of elements and enriches them with predictions.
216
-
217
- Parameters
218
- ----------
219
- doc : DoclingDocument
220
- The document being processed.
221
- element_batch : Iterable[ItemAndImageEnrichmentElement]
222
- A batch of elements to be processed.
223
-
224
- Returns
225
- -------
226
- Iterable[Any]
227
- An iterable of enriched elements.
228
- """
229
- if not self.enabled:
230
- for element in element_batch:
231
- yield element.item
232
- return
233
-
234
- labels: List[str] = []
235
- images: List[Union[Image.Image, np.ndarray]] = []
236
- elements: List[TextItem] = []
237
- for el in element_batch:
238
- assert isinstance(el.item, TextItem)
239
- elements.append(el.item)
240
- labels.append(el.item.label)
241
- images.append(el.image)
242
-
243
- outputs = self.code_formula_model.predict(images, labels)
244
-
245
- for item, output in zip(elements, outputs):
246
- if isinstance(item, CodeItem):
247
- output, code_language = self._extract_code_language(output)
248
- item.code_language = self._get_code_language_enum(code_language)
249
- item.text = output
250
-
251
- yield item
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/document_picture_classifier.py DELETED
@@ -1,190 +0,0 @@
1
- from pathlib import Path
2
- from typing import Iterable, List, Literal, Optional, Tuple, Union
3
-
4
- import numpy as np
5
- from docling_core.types.doc import (
6
- DoclingDocument,
7
- NodeItem,
8
- PictureClassificationClass,
9
- PictureClassificationData,
10
- PictureItem,
11
- )
12
- from PIL import Image
13
- from pydantic import BaseModel
14
-
15
- from docling.datamodel.pipeline_options import AcceleratorOptions
16
- from docling.models.base_model import BaseEnrichmentModel
17
- from docling.utils.accelerator_utils import decide_device
18
-
19
-
20
- class DocumentPictureClassifierOptions(BaseModel):
21
- """
22
- Options for configuring the DocumentPictureClassifier.
23
-
24
- Attributes
25
- ----------
26
- kind : Literal["document_picture_classifier"]
27
- Identifier for the type of classifier.
28
- """
29
-
30
- kind: Literal["document_picture_classifier"] = "document_picture_classifier"
31
-
32
-
33
- class DocumentPictureClassifier(BaseEnrichmentModel):
34
- """
35
- A model for classifying pictures in documents.
36
-
37
- This class enriches document pictures with predicted classifications
38
- based on a predefined set of classes.
39
-
40
- Attributes
41
- ----------
42
- enabled : bool
43
- Whether the classifier is enabled for use.
44
- options : DocumentPictureClassifierOptions
45
- Configuration options for the classifier.
46
- document_picture_classifier : DocumentPictureClassifierPredictor
47
- The underlying prediction model, loaded if the classifier is enabled.
48
-
49
- Methods
50
- -------
51
- __init__(enabled, artifacts_path, options, accelerator_options)
52
- Initializes the classifier with specified configurations.
53
- is_processable(doc, element)
54
- Checks if the given element can be processed by the classifier.
55
- __call__(doc, element_batch)
56
- Processes a batch of elements and adds classification annotations.
57
- """
58
-
59
- _model_repo_folder = "ds4sd--DocumentFigureClassifier"
60
- images_scale = 2
61
-
62
- def __init__(
63
- self,
64
- enabled: bool,
65
- artifacts_path: Optional[Path],
66
- options: DocumentPictureClassifierOptions,
67
- accelerator_options: AcceleratorOptions,
68
- ):
69
- """
70
- Initializes the DocumentPictureClassifier.
71
-
72
- Parameters
73
- ----------
74
- enabled : bool
75
- Indicates whether the classifier is enabled.
76
- artifacts_path : Optional[Union[Path, str]],
77
- Path to the directory containing model artifacts.
78
- options : DocumentPictureClassifierOptions
79
- Configuration options for the classifier.
80
- accelerator_options : AcceleratorOptions
81
- Options for configuring the device and parallelism.
82
- """
83
- self.enabled = enabled
84
- self.options = options
85
-
86
- if self.enabled:
87
- device = decide_device(accelerator_options.device)
88
- from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
89
- DocumentFigureClassifierPredictor,
90
- )
91
-
92
- if artifacts_path is None:
93
- artifacts_path = self.download_models()
94
- else:
95
- artifacts_path = artifacts_path / self._model_repo_folder
96
-
97
- self.document_picture_classifier = DocumentFigureClassifierPredictor(
98
- artifacts_path=str(artifacts_path),
99
- device=device,
100
- num_threads=accelerator_options.num_threads,
101
- )
102
-
103
- @staticmethod
104
- def download_models(
105
- local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
106
- ) -> Path:
107
- from huggingface_hub import snapshot_download
108
- from huggingface_hub.utils import disable_progress_bars
109
-
110
- if not progress:
111
- disable_progress_bars()
112
- download_path = snapshot_download(
113
- repo_id="ds4sd/DocumentFigureClassifier",
114
- force_download=force,
115
- local_dir=local_dir,
116
- revision="v1.0.0",
117
- )
118
-
119
- return Path(download_path)
120
-
121
- def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
122
- """
123
- Determines if the given element can be processed by the classifier.
124
-
125
- Parameters
126
- ----------
127
- doc : DoclingDocument
128
- The document containing the element.
129
- element : NodeItem
130
- The element to be checked.
131
-
132
- Returns
133
- -------
134
- bool
135
- True if the element is a PictureItem and processing is enabled; False otherwise.
136
- """
137
- return self.enabled and isinstance(element, PictureItem)
138
-
139
- def __call__(
140
- self,
141
- doc: DoclingDocument,
142
- element_batch: Iterable[NodeItem],
143
- ) -> Iterable[NodeItem]:
144
- """
145
- Processes a batch of elements and enriches them with classification predictions.
146
-
147
- Parameters
148
- ----------
149
- doc : DoclingDocument
150
- The document containing the elements to be processed.
151
- element_batch : Iterable[NodeItem]
152
- A batch of pictures to classify.
153
-
154
- Returns
155
- -------
156
- Iterable[NodeItem]
157
- An iterable of NodeItem objects after processing. The field
158
- 'data.classification' is added containing the classification for each picture.
159
- """
160
- if not self.enabled:
161
- for element in element_batch:
162
- yield element
163
- return
164
-
165
- images: List[Union[Image.Image, np.ndarray]] = []
166
- elements: List[PictureItem] = []
167
- for el in element_batch:
168
- assert isinstance(el, PictureItem)
169
- elements.append(el)
170
- img = el.get_image(doc)
171
- assert img is not None
172
- images.append(img)
173
-
174
- outputs = self.document_picture_classifier.predict(images)
175
-
176
- for element, output in zip(elements, outputs):
177
- element.annotations.append(
178
- PictureClassificationData(
179
- provenance="DocumentPictureClassifier",
180
- predicted_classes=[
181
- PictureClassificationClass(
182
- class_name=pred[0],
183
- confidence=pred[1],
184
- )
185
- for pred in output
186
- ],
187
- )
188
- )
189
-
190
- yield element
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/ds_glm_model.py DELETED
@@ -1,386 +0,0 @@
1
- import copy
2
- import random
3
- from pathlib import Path
4
- from typing import List, Union
5
-
6
- from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import (
8
- BoundingBox,
9
- CoordOrigin,
10
- DocItemLabel,
11
- DoclingDocument,
12
- )
13
- from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.legacy_doc.base import (
15
- Figure,
16
- PageDimensions,
17
- PageReference,
18
- Prov,
19
- Ref,
20
- )
21
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
22
- from docling_core.types.legacy_doc.base import TableCell
23
- from docling_core.types.legacy_doc.document import BaseText
24
- from docling_core.types.legacy_doc.document import (
25
- CCSDocumentDescription as DsDocumentDescription,
26
- )
27
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
28
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
29
- from PIL import ImageDraw
30
- from pydantic import BaseModel, ConfigDict, TypeAdapter
31
-
32
- from docling.datamodel.base_models import (
33
- Cluster,
34
- ContainerElement,
35
- FigureElement,
36
- Table,
37
- TextElement,
38
- )
39
- from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
40
- from docling.datamodel.settings import settings
41
- from docling.utils.glm_utils import to_docling_document
42
- from docling.utils.profiling import ProfilingScope, TimeRecorder
43
- from docling.utils.utils import create_hash
44
-
45
-
46
- class GlmOptions(BaseModel):
47
- model_config = ConfigDict(protected_namespaces=())
48
-
49
- model_names: str = "" # e.g. "language;term;reference"
50
-
51
-
52
- class GlmModel:
53
- def __init__(self, options: GlmOptions):
54
- self.options = options
55
-
56
- self.model = nlp_model(loglevel="error", text_ordering=True)
57
-
58
- def _to_legacy_document(self, conv_res) -> DsDocument:
59
- title = ""
60
- desc: DsDocumentDescription = DsDocumentDescription(logs=[])
61
-
62
- page_hashes = [
63
- PageReference(
64
- hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
65
- page=p.page_no + 1,
66
- model="default",
67
- )
68
- for p in conv_res.pages
69
- ]
70
-
71
- file_info = DsFileInfoObject(
72
- filename=conv_res.input.file.name,
73
- document_hash=conv_res.input.document_hash,
74
- num_pages=conv_res.input.page_count,
75
- page_hashes=page_hashes,
76
- )
77
-
78
- main_text: List[Union[Ref, BaseText]] = []
79
- page_headers: List[Union[Ref, BaseText]] = []
80
- page_footers: List[Union[Ref, BaseText]] = []
81
-
82
- tables: List[DsSchemaTable] = []
83
- figures: List[Figure] = []
84
-
85
- page_no_to_page = {p.page_no: p for p in conv_res.pages}
86
-
87
- for element in conv_res.assembled.body:
88
- # Convert bboxes to lower-left origin.
89
- target_bbox = DsBoundingBox(
90
- element.cluster.bbox.to_bottom_left_origin(
91
- page_no_to_page[element.page_no].size.height
92
- ).as_tuple()
93
- )
94
-
95
- if isinstance(element, TextElement):
96
- main_text.append(
97
- BaseText(
98
- text=element.text,
99
- obj_type=layout_label_to_ds_type.get(element.label),
100
- name=element.label,
101
- prov=[
102
- Prov(
103
- bbox=target_bbox,
104
- page=element.page_no + 1,
105
- span=[0, len(element.text)],
106
- )
107
- ],
108
- )
109
- )
110
- elif isinstance(element, Table):
111
- index = len(tables)
112
- ref_str = f"#/tables/{index}"
113
- main_text.append(
114
- Ref(
115
- name=element.label,
116
- obj_type=layout_label_to_ds_type.get(element.label),
117
- ref=ref_str,
118
- ),
119
- )
120
-
121
- # Initialise empty table data grid (only empty cells)
122
- table_data = [
123
- [
124
- TableCell(
125
- text="",
126
- # bbox=[0,0,0,0],
127
- spans=[[i, j]],
128
- obj_type="body",
129
- )
130
- for j in range(element.num_cols)
131
- ]
132
- for i in range(element.num_rows)
133
- ]
134
-
135
- # Overwrite cells in table data for which there is actual cell content.
136
- for cell in element.table_cells:
137
- for i in range(
138
- min(cell.start_row_offset_idx, element.num_rows),
139
- min(cell.end_row_offset_idx, element.num_rows),
140
- ):
141
- for j in range(
142
- min(cell.start_col_offset_idx, element.num_cols),
143
- min(cell.end_col_offset_idx, element.num_cols),
144
- ):
145
- celltype = "body"
146
- if cell.column_header:
147
- celltype = "col_header"
148
- elif cell.row_header:
149
- celltype = "row_header"
150
- elif cell.row_section:
151
- celltype = "row_section"
152
-
153
- def make_spans(cell):
154
- for rspan in range(
155
- min(cell.start_row_offset_idx, element.num_rows),
156
- min(cell.end_row_offset_idx, element.num_rows),
157
- ):
158
- for cspan in range(
159
- min(
160
- cell.start_col_offset_idx, element.num_cols
161
- ),
162
- min(cell.end_col_offset_idx, element.num_cols),
163
- ):
164
- yield [rspan, cspan]
165
-
166
- spans = list(make_spans(cell))
167
- if cell.bbox is not None:
168
- bbox = cell.bbox.to_bottom_left_origin(
169
- page_no_to_page[element.page_no].size.height
170
- ).as_tuple()
171
- else:
172
- bbox = None
173
-
174
- table_data[i][j] = TableCell(
175
- text=cell.text,
176
- bbox=bbox,
177
- # col=j,
178
- # row=i,
179
- spans=spans,
180
- obj_type=celltype,
181
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
182
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
183
- )
184
-
185
- tables.append(
186
- DsSchemaTable(
187
- num_cols=element.num_cols,
188
- num_rows=element.num_rows,
189
- obj_type=layout_label_to_ds_type.get(element.label),
190
- data=table_data,
191
- prov=[
192
- Prov(
193
- bbox=target_bbox,
194
- page=element.page_no + 1,
195
- span=[0, 0],
196
- )
197
- ],
198
- )
199
- )
200
-
201
- elif isinstance(element, FigureElement):
202
- index = len(figures)
203
- ref_str = f"#/figures/{index}"
204
- main_text.append(
205
- Ref(
206
- name=element.label,
207
- obj_type=layout_label_to_ds_type.get(element.label),
208
- ref=ref_str,
209
- ),
210
- )
211
- figures.append(
212
- Figure(
213
- prov=[
214
- Prov(
215
- bbox=target_bbox,
216
- page=element.page_no + 1,
217
- span=[0, 0],
218
- )
219
- ],
220
- obj_type=layout_label_to_ds_type.get(element.label),
221
- payload={
222
- "children": TypeAdapter(List[Cluster]).dump_python(
223
- element.cluster.children
224
- )
225
- }, # hack to channel child clusters through GLM
226
- )
227
- )
228
- elif isinstance(element, ContainerElement):
229
- main_text.append(
230
- BaseText(
231
- text="",
232
- payload={
233
- "children": TypeAdapter(List[Cluster]).dump_python(
234
- element.cluster.children
235
- )
236
- }, # hack to channel child clusters through GLM
237
- obj_type=layout_label_to_ds_type.get(element.label),
238
- name=element.label,
239
- prov=[
240
- Prov(
241
- bbox=target_bbox,
242
- page=element.page_no + 1,
243
- span=[0, 0],
244
- )
245
- ],
246
- )
247
- )
248
-
249
- # We can throw in headers and footers at the end of the legacy doc
250
- # since the reading-order will re-sort it later.
251
- for element in conv_res.assembled.headers:
252
- # Convert bboxes to lower-left origin.
253
- target_bbox = DsBoundingBox(
254
- element.cluster.bbox.to_bottom_left_origin(
255
- page_no_to_page[element.page_no].size.height
256
- ).as_tuple()
257
- )
258
-
259
- if isinstance(element, TextElement):
260
-
261
- tel = BaseText(
262
- text=element.text,
263
- obj_type=layout_label_to_ds_type.get(element.label),
264
- name=element.label,
265
- prov=[
266
- Prov(
267
- bbox=target_bbox,
268
- page=element.page_no + 1,
269
- span=[0, len(element.text)],
270
- )
271
- ],
272
- )
273
- if element.label == DocItemLabel.PAGE_HEADER:
274
- index = len(page_headers)
275
- ref_str = f"#/page-headers/{index}"
276
- main_text.append(
277
- Ref(
278
- name=element.label,
279
- obj_type=layout_label_to_ds_type.get(element.label),
280
- ref=ref_str,
281
- ),
282
- )
283
- page_headers.append(tel)
284
- elif element.label == DocItemLabel.PAGE_FOOTER:
285
- index = len(page_footers)
286
- ref_str = f"#/page-footers/{index}"
287
- main_text.append(
288
- Ref(
289
- name=element.label,
290
- obj_type=layout_label_to_ds_type.get(element.label),
291
- ref=ref_str,
292
- ),
293
- )
294
- page_footers.append(tel)
295
-
296
- page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in conv_res.pages
299
- if p.size is not None
300
- ]
301
-
302
- ds_doc: DsDocument = DsDocument(
303
- name=title,
304
- description=desc,
305
- file_info=file_info,
306
- main_text=main_text,
307
- tables=tables,
308
- figures=figures,
309
- page_dimensions=page_dimensions,
310
- page_headers=page_headers,
311
- page_footers=page_footers,
312
- )
313
-
314
- return ds_doc
315
-
316
- def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
317
- with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
318
- ds_doc = self._to_legacy_document(conv_res)
319
- ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
320
-
321
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
322
-
323
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
324
- 1 == 1
325
-
326
- # DEBUG code:
327
- def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
328
- clusters_to_draw = []
329
- image = copy.deepcopy(conv_res.pages[page_no].image)
330
- for ix, elem in enumerate(ds_document.main_text):
331
- if isinstance(elem, BaseText):
332
- prov = elem.prov[0] # type: ignore
333
- elif isinstance(elem, Ref):
334
- _, arr, index = elem.ref.split("/")
335
- index = int(index) # type: ignore
336
- if arr == "tables":
337
- prov = ds_document.tables[index].prov[0]
338
- elif arr == "figures":
339
- prov = ds_document.pictures[index].prov[0]
340
- else:
341
- prov = None
342
-
343
- if prov and prov.page == page_no:
344
- clusters_to_draw.append(
345
- Cluster(
346
- id=ix,
347
- label=elem.name,
348
- bbox=BoundingBox.from_tuple(
349
- coord=prov.bbox, # type: ignore
350
- origin=CoordOrigin.BOTTOMLEFT,
351
- ).to_top_left_origin(conv_res.pages[page_no].size.height),
352
- )
353
- )
354
-
355
- draw = ImageDraw.Draw(image)
356
- for c in clusters_to_draw:
357
- x0, y0, x1, y1 = c.bbox.as_tuple()
358
- draw.rectangle([(x0, y0), (x1, y1)], outline="red")
359
- draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
360
-
361
- cell_color = (
362
- random.randint(30, 140),
363
- random.randint(30, 140),
364
- random.randint(30, 140),
365
- )
366
- for tc in c.cells: # [:1]:
367
- x0, y0, x1, y1 = tc.bbox.as_tuple()
368
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
369
-
370
- if show:
371
- image.show()
372
- else:
373
- out_path: Path = (
374
- Path(settings.debug.debug_output_path)
375
- / f"debug_{conv_res.input.file.stem}"
376
- )
377
- out_path.mkdir(parents=True, exist_ok=True)
378
-
379
- out_file = out_path / f"doc_page_{page_no:05}.png"
380
- image.save(str(out_file), format="png")
381
-
382
- # for item in ds_doc.page_dimensions:
383
- # page_no = item.page
384
- # draw_clusters_and_cells(ds_doc, page_no)
385
-
386
- return docling_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/easyocr_model.py DELETED
@@ -1,177 +0,0 @@
1
- import logging
2
- import warnings
3
- import zipfile
4
- from pathlib import Path
5
- from typing import Iterable, List, Optional
6
-
7
- import numpy
8
- from docling_core.types.doc import BoundingBox, CoordOrigin
9
-
10
- from docling.datamodel.base_models import Cell, OcrCell, Page
11
- from docling.datamodel.document import ConversionResult
12
- from docling.datamodel.pipeline_options import (
13
- AcceleratorDevice,
14
- AcceleratorOptions,
15
- EasyOcrOptions,
16
- )
17
- from docling.datamodel.settings import settings
18
- from docling.models.base_ocr_model import BaseOcrModel
19
- from docling.utils.accelerator_utils import decide_device
20
- from docling.utils.profiling import TimeRecorder
21
- from docling.utils.utils import download_url_with_progress
22
-
23
- _log = logging.getLogger(__name__)
24
-
25
-
26
- class EasyOcrModel(BaseOcrModel):
27
- _model_repo_folder = "EasyOcr"
28
-
29
- def __init__(
30
- self,
31
- enabled: bool,
32
- artifacts_path: Optional[Path],
33
- options: EasyOcrOptions,
34
- accelerator_options: AcceleratorOptions,
35
- ):
36
- super().__init__(enabled=enabled, options=options)
37
- self.options: EasyOcrOptions
38
-
39
- self.scale = 3 # multiplier for 72 dpi == 216 dpi.
40
-
41
- if self.enabled:
42
- try:
43
- import easyocr
44
- except ImportError:
45
- raise ImportError(
46
- "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
47
- "Alternatively, Docling has support for other OCR engines. See the documentation."
48
- )
49
-
50
- if self.options.use_gpu is None:
51
- device = decide_device(accelerator_options.device)
52
- # Enable easyocr GPU if running on CUDA, MPS
53
- use_gpu = any(
54
- [
55
- device.startswith(x)
56
- for x in [
57
- AcceleratorDevice.CUDA.value,
58
- AcceleratorDevice.MPS.value,
59
- ]
60
- ]
61
- )
62
- else:
63
- warnings.warn(
64
- "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
65
- "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
66
- "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
67
- )
68
- use_gpu = self.options.use_gpu
69
-
70
- download_enabled = self.options.download_enabled
71
- model_storage_directory = self.options.model_storage_directory
72
- if artifacts_path is not None and model_storage_directory is None:
73
- download_enabled = False
74
- model_storage_directory = str(artifacts_path / self._model_repo_folder)
75
-
76
- self.reader = easyocr.Reader(
77
- lang_list=self.options.lang,
78
- gpu=use_gpu,
79
- model_storage_directory=model_storage_directory,
80
- recog_network=self.options.recog_network,
81
- download_enabled=download_enabled,
82
- verbose=False,
83
- )
84
-
85
- @staticmethod
86
- def download_models(
87
- detection_models: List[str] = ["craft"],
88
- recognition_models: List[str] = ["english_g2", "latin_g2"],
89
- local_dir: Optional[Path] = None,
90
- force: bool = False,
91
- progress: bool = False,
92
- ) -> Path:
93
- # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
94
- from easyocr.config import detection_models as det_models_dict
95
- from easyocr.config import recognition_models as rec_models_dict
96
-
97
- if local_dir is None:
98
- local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
99
-
100
- local_dir.mkdir(parents=True, exist_ok=True)
101
-
102
- # Collect models to download
103
- download_list = []
104
- for model_name in detection_models:
105
- if model_name in det_models_dict:
106
- download_list.append(det_models_dict[model_name])
107
- for model_name in recognition_models:
108
- if model_name in rec_models_dict["gen2"]:
109
- download_list.append(rec_models_dict["gen2"][model_name])
110
-
111
- # Download models
112
- for model_details in download_list:
113
- buf = download_url_with_progress(model_details["url"], progress=progress)
114
- with zipfile.ZipFile(buf, "r") as zip_ref:
115
- zip_ref.extractall(local_dir)
116
-
117
- return local_dir
118
-
119
- def __call__(
120
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
121
- ) -> Iterable[Page]:
122
-
123
- if not self.enabled:
124
- yield from page_batch
125
- return
126
-
127
- for page in page_batch:
128
-
129
- assert page._backend is not None
130
- if not page._backend.is_valid():
131
- yield page
132
- else:
133
- with TimeRecorder(conv_res, "ocr"):
134
- ocr_rects = self.get_ocr_rects(page)
135
-
136
- all_ocr_cells = []
137
- for ocr_rect in ocr_rects:
138
- # Skip zero area boxes
139
- if ocr_rect.area() == 0:
140
- continue
141
- high_res_image = page._backend.get_page_image(
142
- scale=self.scale, cropbox=ocr_rect
143
- )
144
- im = numpy.array(high_res_image)
145
- result = self.reader.readtext(im)
146
-
147
- del high_res_image
148
- del im
149
-
150
- cells = [
151
- OcrCell(
152
- id=ix,
153
- text=line[1],
154
- confidence=line[2],
155
- bbox=BoundingBox.from_tuple(
156
- coord=(
157
- (line[0][0][0] / self.scale) + ocr_rect.l,
158
- (line[0][0][1] / self.scale) + ocr_rect.t,
159
- (line[0][2][0] / self.scale) + ocr_rect.l,
160
- (line[0][2][1] / self.scale) + ocr_rect.t,
161
- ),
162
- origin=CoordOrigin.TOPLEFT,
163
- ),
164
- )
165
- for ix, line in enumerate(result)
166
- if line[2] >= self.options.confidence_threshold
167
- ]
168
- all_ocr_cells.extend(cells)
169
-
170
- # Post-process the cells
171
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
172
-
173
- # DEBUG code:
174
- if settings.debug.visualize_ocr:
175
- self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
176
-
177
- yield page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Paper2Video/src/evaluation/PresentQuiz/docling/models/layout_model.py DELETED
@@ -1,197 +0,0 @@
1
- import copy
2
- import logging
3
- import warnings
4
- from pathlib import Path
5
- from typing import Iterable, Optional, Union
6
-
7
- from docling_core.types.doc import DocItemLabel
8
- from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
9
- from PIL import Image
10
-
11
- from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
12
- from docling.datamodel.document import ConversionResult
13
- from docling.datamodel.pipeline_options import AcceleratorOptions
14
- from docling.datamodel.settings import settings
15
- from docling.models.base_model import BasePageModel
16
- from docling.utils.accelerator_utils import decide_device
17
- from docling.utils.layout_postprocessor import LayoutPostprocessor
18
- from docling.utils.profiling import TimeRecorder
19
- from docling.utils.visualization import draw_clusters
20
-
21
- _log = logging.getLogger(__name__)
22
-
23
-
24
- class LayoutModel(BasePageModel):
25
- _model_repo_folder = "ds4sd--docling-models"
26
- _model_path = "model_artifacts/layout"
27
-
28
- TEXT_ELEM_LABELS = [
29
- DocItemLabel.TEXT,
30
- DocItemLabel.FOOTNOTE,
31
- DocItemLabel.CAPTION,
32
- DocItemLabel.CHECKBOX_UNSELECTED,
33
- DocItemLabel.CHECKBOX_SELECTED,
34
- DocItemLabel.SECTION_HEADER,
35
- DocItemLabel.PAGE_HEADER,
36
- DocItemLabel.PAGE_FOOTER,
37
- DocItemLabel.CODE,
38
- DocItemLabel.LIST_ITEM,
39
- DocItemLabel.FORMULA,
40
- ]
41
- PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
42
-
43
- TABLE_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
44
- FIGURE_LABEL = DocItemLabel.PICTURE
45
- FORMULA_LABEL = DocItemLabel.FORMULA
46
- CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
47
-
48
- def __init__(
49
- self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
50
- ):
51
- device = decide_device(accelerator_options.device)
52
-
53
- if artifacts_path is None:
54
- artifacts_path = self.download_models() / self._model_path
55
- else:
56
- # will become the default in the future
57
- if (artifacts_path / self._model_repo_folder).exists():
58
- artifacts_path = (
59
- artifacts_path / self._model_repo_folder / self._model_path
60
- )
61
- elif (artifacts_path / self._model_path).exists():
62
- warnings.warn(
63
- "The usage of artifacts_path containing directly "
64
- f"{self._model_path} is deprecated. Please point "
65
- "the artifacts_path to the parent containing "
66
- f"the {self._model_repo_folder} folder.",
67
- DeprecationWarning,
68
- stacklevel=3,
69
- )
70
- artifacts_path = artifacts_path / self._model_path
71
-
72
- self.layout_predictor = LayoutPredictor(
73
- artifact_path=str(artifacts_path),
74
- device=device,
75
- num_threads=accelerator_options.num_threads,
76
- )
77
-
78
- @staticmethod
79
- def download_models(
80
- local_dir: Optional[Path] = None,
81
- force: bool = False,
82
- progress: bool = False,
83
- ) -> Path:
84
- from huggingface_hub import snapshot_download
85
- from huggingface_hub.utils import disable_progress_bars
86
-
87
- if not progress:
88
- disable_progress_bars()
89
- download_path = snapshot_download(
90
- repo_id="ds4sd/docling-models",
91
- force_download=force,
92
- local_dir=local_dir,
93
- revision="v2.1.0",
94
- )
95
-
96
- return Path(download_path)
97
-
98
- def draw_clusters_and_cells_side_by_side(
99
- self, conv_res, page, clusters, mode_prefix: str, show: bool = False
100
- ):
101
- """
102
- Draws a page image side by side with clusters filtered into two categories:
103
- - Left: Clusters excluding FORM, KEY_VALUE_REGION, and PICTURE.
104
- - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
105
- Includes label names and confidence scores for each cluster.
106
- """
107
- scale_x = page.image.width / page.size.width
108
- scale_y = page.image.height / page.size.height
109
-
110
- # Filter clusters for left and right images
111
- exclude_labels = {
112
- DocItemLabel.FORM,
113
- DocItemLabel.KEY_VALUE_REGION,
114
- DocItemLabel.PICTURE,
115
- }
116
- left_clusters = [c for c in clusters if c.label not in exclude_labels]
117
- right_clusters = [c for c in clusters if c.label in exclude_labels]
118
- # Create a deep copy of the original image for both sides
119
- left_image = copy.deepcopy(page.image)
120
- right_image = copy.deepcopy(page.image)
121
-
122
- # Draw clusters on both images
123
- draw_clusters(left_image, left_clusters, scale_x, scale_y)
124
- draw_clusters(right_image, right_clusters, scale_x, scale_y)
125
- # Combine the images side by side
126
- combined_width = left_image.width * 2
127
- combined_height = left_image.height
128
- combined_image = Image.new("RGB", (combined_width, combined_height))
129
- combined_image.paste(left_image, (0, 0))
130
- combined_image.paste(right_image, (left_image.width, 0))
131
- if show:
132
- combined_image.show()
133
- else:
134
- out_path: Path = (
135
- Path(settings.debug.debug_output_path)
136
- / f"debug_{conv_res.input.file.stem}"
137
- )
138
- out_path.mkdir(parents=True, exist_ok=True)
139
- out_file = out_path / f"{mode_prefix}_layout_page_{page.page_no:05}.png"
140
- combined_image.save(str(out_file), format="png")
141
-
142
- def __call__(
143
- self, conv_res: ConversionResult, page_batch: Iterable[Page]
144
- ) -> Iterable[Page]:
145
-
146
- for page in page_batch:
147
- assert page._backend is not None
148
- if not page._backend.is_valid():
149
- yield page
150
- else:
151
- with TimeRecorder(conv_res, "layout"):
152
- assert page.size is not None
153
- page_image = page.get_image(scale=1.0)
154
- assert page_image is not None
155
-
156
- clusters = []
157
- for ix, pred_item in enumerate(
158
- self.layout_predictor.predict(page_image)
159
- ):
160
- label = DocItemLabel(
161
- pred_item["label"]
162
- .lower()
163
- .replace(" ", "_")
164
- .replace("-", "_")
165
- ) # Temporary, until docling-ibm-model uses docling-core types
166
- cluster = Cluster(
167
- id=ix,
168
- label=label,
169
- confidence=pred_item["confidence"],
170
- bbox=BoundingBox.model_validate(pred_item),
171
- cells=[],
172
- )
173
- clusters.append(cluster)
174
-
175
- if settings.debug.visualize_raw_layout:
176
- self.draw_clusters_and_cells_side_by_side(
177
- conv_res, page, clusters, mode_prefix="raw"
178
- )
179
-
180
- # Apply postprocessing
181
-
182
- processed_clusters, processed_cells = LayoutPostprocessor(
183
- page.cells, clusters, page.size
184
- ).postprocess()
185
- # processed_clusters, processed_cells = clusters, page.cells
186
-
187
- page.cells = processed_cells
188
- page.predictions.layout = LayoutPrediction(
189
- clusters=processed_clusters
190
- )
191
-
192
- if settings.debug.visualize_layout:
193
- self.draw_clusters_and_cells_side_by_side(
194
- conv_res, page, processed_clusters, mode_prefix="postprocessed"
195
- )
196
-
197
- yield page