| |
| from .gmft import convert_gmft |
| from .pypdf import convert_pypdf |
| from .smoldocling import convert_smoldocling |
| from .unstructured import convert_unstructured |
|
|
# Public API of this package: the convert_* callables re-exported from the
# backend submodules, plus the two module-level registries defined below.
# The registries are listed explicitly so that `from <package> import *` and
# __all__-driven tooling expose them alongside the converters.
__all__ = [
    "convert_smoldocling",
    "convert_unstructured",
    "convert_gmft",
    "convert_pypdf",
    "SUPPORTED_METHODS",
    "SUPPORTED_METHODS_METADATA",
]
|
|
# Labels of the parsing methods this package advertises as supported, in the
# order they are declared. Every label is also a key of
# SUPPORTED_METHODS_METADATA.
# NOTE(review): "PyMuPDF" is listed, but no PyMuPDF converter is imported at
# the top of this module -- confirm it is wired up elsewhere.
SUPPORTED_METHODS = [
    # Whole-document methods.
    "SmolDocling",
    "PyMuPDF",
    "PyPDF",
    "Unstructured",
    # Table extraction only.
    "GMFT (table-only)",
]
# Descriptive metadata for document-parsing tools, keyed by method label.
# Each record carries four fields, in this order:
#   name          -- short tool name (may differ from the label, e.g. the
#                    "Gemini (API)" label maps to the name "Gemini")
#   description   -- one-line blurb about the tool
#   url           -- project link, or None when none is recorded
#   documentation -- documentation link
# The mapping also holds records for tools that are not listed in
# SUPPORTED_METHODS (Marker, MinerU, Docling, Gemini, Img2Table, Sycamore).
SUPPORTED_METHODS_METADATA = {
    "Unstructured": dict(
        name="Unstructured",
        description="Open-Source Pre-Processing Tools for Unstructured Data.",
        url="https://github.com/Unstructured-IO/unstructured",
        documentation="https://docs.unstructured.io/welcome",
    ),
    "Marker": dict(
        name="Marker",
        description="Marker converts documents to markdown, JSON, and HTML quickly and accurately.",
        url="https://github.com/VikParuchuri/marker",
        documentation="https://github.com/VikParuchuri/marker",
    ),
    "MinerU": dict(
        name="MinerU",
        description="A high-quality tool for convert PDF to Markdown and JSON.",
        url="https://github.com/opendatalab/MinerU",
        documentation="https://github.com/opendatalab/MinerU",
    ),
    "Docling": dict(
        name="Docling",
        description="Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.",
        url="https://github.com/DS4SD/docling",
        documentation="https://ds4sd.github.io/docling/",
    ),
    "SmolDocling": dict(
        name="SmolDocling",
        description="SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.",
        url="https://huggingface.co/ds4sd/SmolDocling-256M-preview",
        documentation="https://huggingface.co/ds4sd/SmolDocling-256M-preview",
    ),
    "PyMuPDF": dict(
        name="PyMuPDF",
        description="PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.",
        url="https://github.com/pymupdf/PyMuPDF",
        documentation="https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html",
    ),
    "Gemini (API)": dict(
        name="Gemini",
        description="Using Gemini multimodal API to parse PDF to markdown.",
        url=None,  # no project link is recorded for this entry
        documentation="https://ai.google.dev/gemini-api/docs/document-processing?lang=python",
    ),
    "Img2Table (table-only)": dict(
        name="Img2Table",
        description="img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing.",
        url="https://github.com/xavctn/img2table",
        documentation="https://github.com/xavctn/img2table",
    ),
    "GMFT (table-only)": dict(
        name="GMFT",
        description="Lightweight, performant, deep table extraction.",
        url="https://github.com/conjuncts/gmft",
        documentation="https://github.com/conjuncts/gmft",
    ),
    "Sycamore": dict(
        name="Sycamore",
        description="Sycamore is an open source, AI-powered document processing engine for ETL, RAG, LLM-based applications, and analytics on unstructured data.",
        url="https://github.com/aryn-ai/sycamore",
        documentation="https://sycamore.readthedocs.io/en/stable/",
    ),
    "PyPDF": dict(
        name="PyPDF",
        description="PyPDF is a pure-Python PDF toolkit that can help you read, write, and manipulate PDF documents.",
        url="https://github.com/py-pdf/pypdf",
        documentation="https://pypdf.readthedocs.io/en/stable",
    ),
}
|
|