Vik Paruchuri commited on
Commit
93c1274
·
2 Parent(s): 333b95b 85e05d9

Merge pull request #472 from VikParuchuri/vik_dev

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/workflows/scripts.yml +29 -0
  2. README.md +55 -10
  3. benchmarks/table/scoring.py +109 -0
  4. benchmarks/table/table.py +187 -0
  5. chunk_convert.py +2 -20
  6. convert.py +2 -115
  7. convert_single.py +2 -41
  8. marker/builders/document.py +6 -1
  9. marker/builders/layout.py +37 -15
  10. marker/builders/llm_layout.py +59 -44
  11. marker/builders/ocr.py +7 -11
  12. marker/config/parser.py +17 -0
  13. marker/config/printer.py +42 -16
  14. marker/converters/pdf.py +34 -27
  15. marker/converters/table.py +49 -0
  16. marker/models.py +34 -71
  17. marker/processors/debug.py +2 -2
  18. marker/processors/equation.py +4 -6
  19. marker/processors/ignoretext.py +1 -3
  20. marker/processors/llm/__init__.py +4 -13
  21. marker/processors/llm/llm_complex.py +17 -11
  22. marker/processors/llm/llm_equation.py +82 -0
  23. marker/processors/llm/llm_form.py +57 -35
  24. marker/processors/llm/llm_handwriting.py +86 -0
  25. marker/processors/llm/llm_image_description.py +5 -2
  26. marker/processors/llm/llm_table.py +55 -23
  27. marker/processors/llm/llm_table_merge.py +318 -0
  28. marker/processors/llm/llm_text.py +8 -5
  29. marker/processors/llm/utils.py +5 -2
  30. marker/processors/table.py +204 -44
  31. marker/providers/__init__.py +13 -1
  32. marker/providers/image.py +52 -0
  33. marker/providers/pdf.py +21 -6
  34. marker/providers/registry.py +12 -0
  35. marker/renderers/__init__.py +14 -8
  36. marker/renderers/html.py +16 -8
  37. marker/renderers/json.py +3 -0
  38. marker/renderers/markdown.py +84 -8
  39. marker/schema/__init__.py +1 -0
  40. marker/schema/blocks/__init__.py +1 -0
  41. marker/schema/blocks/base.py +25 -5
  42. marker/schema/blocks/basetable.py +39 -0
  43. marker/schema/blocks/caption.py +2 -0
  44. marker/schema/blocks/code.py +2 -1
  45. marker/schema/blocks/complexregion.py +3 -2
  46. marker/schema/blocks/equation.py +4 -3
  47. marker/schema/blocks/figure.py +3 -2
  48. marker/schema/blocks/footnote.py +1 -0
  49. marker/schema/blocks/form.py +4 -15
  50. marker/schema/blocks/handwriting.py +8 -0
.github/workflows/scripts.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Test CLI scripts
2
+
3
+ on: [push]
4
+
5
+ env:
6
+ TORCH_DEVICE: "cpu"
7
+ OCR_ENGINE: "surya"
8
+
9
+ jobs:
10
+ tests:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ - name: Set up Python 3.11
15
+ uses: actions/setup-python@v4
16
+ with:
17
+ python-version: 3.11
18
+ - name: Install python dependencies
19
+ run: |
20
+ pip install poetry
21
+ poetry install
22
+ - name: Download benchmark data
23
+ run: |
24
+ wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
25
+ unzip -o benchmark_data.zip
26
+ - name: Test single script
27
+ run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
28
+ - name: Test convert script
29
+ run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
README.md CHANGED
@@ -1,13 +1,11 @@
1
  # Marker
2
 
3
- Marker converts PDFs to markdown, JSON, and HTML quickly and accurately.
4
 
5
- - Supports a wide range of documents
6
- - Supports all languages
7
  - Removes headers/footers/other artifacts
8
- - Formats tables, forms, and code blocks
9
  - Extracts and saves images along with the markdown
10
- - Converts equations to latex
11
  - Easily extensible with your own formatting and logic
12
  - Optionally boost accuracy with an LLM
13
  - Works on GPU, CPU, or MPS
@@ -63,11 +61,11 @@ There's a hosted API for marker available [here](https://www.datalab.to/):
63
  PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
64
 
65
  - Marker will only convert block equations
66
- - Tables are not always formatted 100% correctly - multiline cells are sometimes split into multiple rows.
67
  - Forms are not converted optimally
68
  - Very complex layouts, with nested tables and forms, may not work
69
 
70
- Note: Passing the `--use_llm` flag will mostly solve all of these issues.
71
 
72
  # Installation
73
 
@@ -84,7 +82,7 @@ pip install marker-pdf
84
  First, some configuration:
85
 
86
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
87
- - Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR.
88
 
89
  ## Interactive App
90
 
@@ -101,9 +99,12 @@ marker_gui
101
  marker_single /path/to/file.pdf
102
  ```
103
 
 
 
104
  Options:
105
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
106
  - `--output_format [markdown|json|html]`: Specify the format for the output results.
 
107
  - `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
108
  - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
109
  - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
@@ -114,6 +115,7 @@ Options:
114
  - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
115
  - `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
116
  - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
 
117
 
118
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
119
 
@@ -179,7 +181,7 @@ rendered = converter("FILEPATH")
179
 
180
  ### Extract blocks
181
 
182
- Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programatically manipulate these blocks.
183
 
184
  Here's an example of extracting all forms from a document:
185
 
@@ -197,6 +199,28 @@ forms = document.contained_blocks((BlockTypes.Form,))
197
 
198
  Look at the processors for more examples of extracting and manipulating blocks.
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  # Output Formats
201
 
202
  ## Markdown
@@ -348,7 +372,7 @@ There are some settings that you may find useful if things aren't working the wa
348
  Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
349
 
350
  # Benchmarks
351
-
352
  Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
353
 
354
  **Speed**
@@ -371,6 +395,18 @@ Marker takes about 6GB of VRAM on average per task, so you can convert 8 documen
371
 
372
  ![Benchmark results](data/images/per_doc.png)
373
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  ## Running your own benchmarks
375
 
376
  You can benchmark the performance of marker on your machine. Install marker manually with:
@@ -380,12 +416,21 @@ git clone https://github.com/VikParuchuri/marker.git
380
  poetry install
381
  ```
382
 
 
 
383
  Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
384
 
385
  ```shell
386
  python benchmarks/overall.py data/pdfs data/references report.json
387
  ```
388
 
 
 
 
 
 
 
 
389
  # Thanks
390
 
391
  This work would not have been possible without amazing open source models and datasets, including (but not limited to):
 
1
  # Marker
2
 
3
+ Marker converts PDFs and images to markdown, JSON, and HTML quickly and accurately.
4
 
5
+ - Supports a range of documents in all languages
 
6
  - Removes headers/footers/other artifacts
7
+ - Formats tables, forms, equations, links, and code blocks
8
  - Extracts and saves images along with the markdown
 
9
  - Easily extensible with your own formatting and logic
10
  - Optionally boost accuracy with an LLM
11
  - Works on GPU, CPU, or MPS
 
61
  PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
62
 
63
  - Marker will only convert block equations
64
+ - Tables are not always formatted 100% correctly
65
  - Forms are not converted optimally
66
  - Very complex layouts, with nested tables and forms, may not work
67
 
68
+ Note: Passing the `--use_llm` flag will mostly solve these issues.
69
 
70
  # Installation
71
 
 
82
  First, some configuration:
83
 
84
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
85
+ - Some PDFs, even digital ones, have bad text in them. Set the `force_ocr` flag on the CLI or via configuration to ensure your PDF runs through OCR, or the `strip_existing_ocr` to keep all digital text, and only strip out any existing OCR text.
86
 
87
  ## Interactive App
88
 
 
99
  marker_single /path/to/file.pdf
100
  ```
101
 
102
+ You can pass in PDFs or images.
103
+
104
  Options:
105
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
106
  - `--output_format [markdown|json|html]`: Specify the format for the output results.
107
+ - `--paginate_output`: Paginates the output, separating pages with `\n\n{PAGE_NUMBER}` followed by 48 `-` characters, then `\n\n`
108
  - `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
109
  - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
110
  - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
 
115
  - `--config_json PATH`: Path to a JSON configuration file containing additional settings.
116
  - `--languages TEXT`: Optionally specify which languages to use for OCR processing. Accepts a comma-separated list. Example: `--languages "en,fr,de"` for English, French, and German.
117
  - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
118
+ - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
119
 
120
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you don't need OCR, marker can work with any language.
121
 
 
181
 
182
  ### Extract blocks
183
 
184
+ Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks.
185
 
186
  Here's an example of extracting all forms from a document:
187
 
 
199
 
200
  Look at the processors for more examples of extracting and manipulating blocks.
201
 
202
+ ## Other converters
203
+
204
+ You can also use other converters that define different conversion pipelines:
205
+
206
+ ### Extract tables
207
+
208
+ The `TableConverter` will only convert and extract tables:
209
+
210
+ ```python
211
+ from marker.converters.table import TableConverter
212
+ from marker.models import create_model_dict
213
+ from marker.output import text_from_rendered
214
+
215
+ converter = TableConverter(
216
+ artifact_dict=create_model_dict(),
217
+ )
218
+ rendered = converter("FILEPATH")
219
+ text, _, images = text_from_rendered(rendered)
220
+ ```
221
+
222
+ This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
223
+
224
  # Output Formats
225
 
226
  ## Markdown
 
372
  Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
373
 
374
  # Benchmarks
375
+ ## Overall PDF Conversion
376
  Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
377
 
378
  **Speed**
 
395
 
396
  ![Benchmark results](data/images/per_doc.png)
397
 
398
+ ## Table Conversion
399
+ Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:
400
+
401
+ | Avg score | Total tables | use_llm |
402
+ |-----------|--------------|---------|
403
+ | 0.824 | 54 | False |
404
+ | 0.873 | 54 | True |
405
+
406
+ The `--use_llm` flag can significantly improve table recognition performance, as you can see.
407
+
408
+ We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged).
409
+
410
  ## Running your own benchmarks
411
 
412
  You can benchmark the performance of marker on your machine. Install marker manually with:
 
416
  poetry install
417
  ```
418
 
419
+ ### Overall PDF Conversion
420
+
421
  Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
422
 
423
  ```shell
424
  python benchmarks/overall.py data/pdfs data/references report.json
425
  ```
426
 
427
+ ### Table Conversion
428
+ The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
429
+
430
+ ```shell
431
+ python benchmarks/table/table.py table_report.json --max_rows 1000
432
+ ```
433
+
434
  # Thanks
435
 
436
  This work would not have been possible without amazing open source models and datasets, including (but not limited to):
benchmarks/table/scoring.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3
+ """
4
+
5
+ import distance
6
+ from apted import APTED, Config
7
+ from apted.helpers import Tree
8
+ from lxml import html
9
+ from collections import deque
10
+
11
def wrap_table_html(table_html: str) -> str:
    """Embed a bare ``<table>`` fragment in a minimal HTML document.

    The TEDS scoring helpers look tables up via the ``body/table`` XPath,
    so fragments are wrapped in ``<html><body>...</body></html>`` first.
    """
    return "<html><body>" + table_html + "</body></html>"
13
+
14
class TableTree(Tree):
    """APTED tree node carrying table metadata (tag, row/col span, cell text)."""

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        self.tag = tag
        self.colspan = colspan
        self.rowspan = rowspan
        self.content = content
        # Tree.__init__ populates self.name and self.children.
        super().__init__(tag, *children)

    def bracket(self):
        """Render the subtree in the bracket notation used by apted."""
        if self.tag == 'td':
            # Cell nodes carry span and text so the edit distance can
            # compare content, not just structure.
            label = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
                self.tag, self.colspan, self.rowspan, self.content)
        else:
            label = '"tag": %s' % self.tag
        parts = [label]
        parts.extend(child.bracket() for child in self.children)
        return "{{{}}}".format("".join(parts))
34
+
35
class CustomConfig(Config):
    """APTED cost configuration: rename cost mixes tag/span equality with
    a normalized Levenshtein distance over cell text."""

    @staticmethod
    def maximum(*sequences):
        """Length of the longest of the given sequences."""
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Levenshtein distance scaled to [0, 1] by the longest sequence."""
        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Cost of relabeling node1 as node2 (0.0 identical, 1.0 different)."""
        attrs_differ = (
            node1.tag != node2.tag
            or node1.colspan != node2.colspan
            or node1.rowspan != node2.rowspan
        )
        if attrs_differ:
            return 1.
        # Same tag and spans: for cells, fall back to text similarity.
        if node1.tag == 'td' and (node1.content or node2.content):
            return self.normalized_distance(node1.content, node2.content)
        return 0.
50
+
51
def tokenize(node):
    """Flatten a cell's HTML subtree into the module-level ``__tokens__`` list.

    Emits an opening tag token, the node text one character at a time, the
    tokenized children, a closing tag token (skipped for 'unk' nodes), and
    finally the tail text (skipped after 'td' nodes).
    """
    global __tokens__
    tag = node.tag
    __tokens__.append('<%s>' % tag)
    if node.text is not None:
        __tokens__.extend(node.text)
    for child in node.getchildren():
        tokenize(child)
    if tag != 'unk':
        __tokens__.append('</%s>' % tag)
    if tag != 'td' and node.tail is not None:
        __tokens__.extend(node.tail)
65
+
66
def tree_convert_html(node, convert_cell=False, parent=None):
    """Convert an lxml HTML tree into the TableTree form required by apted.

    'td' nodes become leaves carrying colspan/rowspan and (optionally)
    tokenized cell text; all other nodes are recursed into. Returns the
    root TableTree on the outermost call (``parent is None``).
    """
    global __tokens__
    is_cell = node.tag == 'td'
    if is_cell:
        cell = []
        if convert_cell:
            __tokens__ = []
            tokenize(node)
            # Strip the wrapping '<td>'/'</td>' tokens; keep only the text.
            cell = __tokens__[1:-1].copy()
        new_node = TableTree(
            node.tag,
            int(node.attrib.get('colspan', '1')),
            int(node.attrib.get('rowspan', '1')),
            cell,
            *deque(),
        )
    else:
        new_node = TableTree(node.tag, None, None, None, *deque())
    if parent is not None:
        parent.children.append(new_node)
    if not is_cell:
        for child in node.getchildren():
            tree_convert_html(child, convert_cell, new_node)
    if parent is None:
        return new_node
91
+
92
def similarity_eval_html(pred, true, structure_only=False):
    """Compute the TEDS similarity between predicted and ground-truth HTML.

    Both inputs must be full documents (see ``wrap_table_html``) containing
    a ``body/table`` element; otherwise the score is 0.0.

    Args:
        pred: Predicted table HTML document string.
        true: Ground-truth table HTML document string.
        structure_only: If True, compare only tree structure and ignore
            cell text.

    Returns:
        A float in [0, 1]; 1.0 means the trees are identical.
    """
    pred, true = html.fromstring(pred), html.fromstring(true)
    if not (pred.xpath('body/table') and true.xpath('body/table')):
        # At least one document has no table to compare against.
        return 0.0
    pred = pred.xpath('body/table')[0]
    true = true.xpath('body/table')[0]
    n_nodes_pred = len(pred.xpath(".//*"))
    n_nodes_true = len(true.xpath(".//*"))
    tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
    tree_true = tree_convert_html(true, convert_cell=not structure_only)
    # Normalize the edit distance by the larger tree so the score is in [0, 1].
    n_nodes = max(n_nodes_pred, n_nodes_true)
    # Renamed local from `distance` to avoid shadowing the imported
    # `distance` module used elsewhere in this file.
    edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
    return 1.0 - (float(edit_distance) / n_nodes)
109
+
benchmarks/table/table.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+
4
+ import numpy as np
5
+
6
+ from marker.renderers.json import JSONOutput, JSONBlockOutput
7
+
8
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
9
+
10
+ import base64
11
+ import time
12
+ import datasets
13
+ from tqdm import tqdm
14
+ import tempfile
15
+ import click
16
+ from tabulate import tabulate
17
+ import json
18
+ from bs4 import BeautifulSoup
19
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
20
+ from pypdfium2._helpers.misc import PdfiumError
21
+ from marker.util import matrix_intersection_area
22
+
23
+ from marker.config.parser import ConfigParser
24
+ from marker.converters.table import TableConverter
25
+ from marker.models import create_model_dict
26
+
27
+ from scoring import wrap_table_html, similarity_eval_html
28
+
29
def update_teds_score(result):
    """Score one marker/ground-truth table pair with TEDS.

    Wraps both HTML fragments in full documents, computes the similarity,
    stores it under ``result['score']`` (mutating in place), and returns
    the updated dict so it can be used with ``executor.map``.
    """
    pred_html = wrap_table_html(result['marker_table'])
    gt_html = wrap_table_html(result['gt_table'])
    result['score'] = similarity_eval_html(pred_html, gt_html)
    return result
35
+
36
+
37
def extract_tables(children: "List[JSONBlockOutput]"):
    """Recursively collect every Table block from a JSON block tree.

    Args:
        children: Block outputs from the JSON renderer; each block may
            carry nested children.

    Returns:
        A flat list of blocks whose ``block_type`` is 'Table', in
        document order.
    """
    tables = []
    for block in children:
        if block.block_type == 'Table':
            tables.append(block)
        elif block.children:
            tables.extend(extract_tables(block.children))
    return tables
45
+
46
+
47
@click.command(help="Benchmark Table to HTML Conversion")
@click.argument("out_file", type=str)
@click.option("--dataset", type=str, default="datalab-to/fintabnet-test", help="Dataset to use")
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm: bool, table_rec_batch_size: int | None):
    """Benchmark marker's table extraction against FinTabNet ground truth.

    Converts each PDF with TableConverter, aligns detected tables to
    ground-truth tables by bounding-box overlap, scores each aligned pair
    with TEDS, prints a summary table, and writes the raw results to
    ``out_file`` as JSON.
    """
    models = create_model_dict()
    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
    start = time.time()

    dataset = datasets.load_dataset(dataset, split='train')
    dataset = dataset.shuffle(seed=0)

    iterations = len(dataset)
    if max_rows is not None:
        iterations = min(max_rows, len(dataset))

    results = []
    total_unaligned = 0
    for i in tqdm(range(iterations), desc='Converting Tables'):
        try:
            row = dataset[i]
            pdf_binary = base64.b64decode(row['pdf'])
            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns

            converter = TableConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=config_parser.get_processors(),
                renderer=config_parser.get_renderer()
            )

            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                temp_pdf_file.write(pdf_binary)
                temp_pdf_file.seek(0)
                tqdm.disable = True  # Silence nested progress bars during conversion
                marker_json = converter(temp_pdf_file.name).children
                tqdm.disable = False

            if len(marker_json) == 0 or len(gt_tables) == 0:
                # Plain string; the original used an f-string with no placeholders.
                print('No tables detected, skipping...')
                total_unaligned += len(gt_tables)
                continue

            marker_tables = extract_tables(marker_json)
            marker_table_boxes = [table.bbox for table in marker_tables]
            page_bbox = marker_json[0].bbox

            # Normalize marker bboxes to page-relative coordinates so they
            # are comparable to fintabnet's normalized ground-truth boxes.
            for bbox in marker_table_boxes:
                bbox[0] = bbox[0] / page_bbox[2]
                bbox[1] = bbox[1] / page_bbox[3]
                bbox[2] = bbox[2] / page_bbox[2]
                bbox[3] = bbox[3] / page_bbox[3]

            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

            aligned_tables = []
            used_tables = set()
            unaligned_tables = set()
            for table_idx, alignment in enumerate(table_alignments):
                try:
                    max_area = np.max(alignment)
                    aligned_idx = np.argmax(alignment)
                except ValueError:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if aligned_idx in used_tables:
                    # Marker table already aligned with another gt table
                    unaligned_tables.add(table_idx)
                    continue

                # Gt table doesn't align well with any marker table
                gt_table_pct = gt_areas[table_idx] / max_area
                if not .75 < gt_table_pct < 1.25:
                    unaligned_tables.add(table_idx)
                    continue

                # Marker table doesn't align with gt table
                marker_table_pct = marker_areas[aligned_idx] / max_area
                if not .75 < marker_table_pct < 1.25:
                    unaligned_tables.add(table_idx)
                    continue

                aligned_tables.append(
                    (marker_tables[aligned_idx], gt_tables[table_idx])
                )
                used_tables.add(aligned_idx)

            total_unaligned += len(unaligned_tables)

            for marker_table, gt_table in aligned_tables:
                gt_table_html = gt_table['html']

                # marker wraps the table in <tbody> which fintabnet data doesn't;
                # fintabnet doesn't use th tags, so replace them for a fair comparison.
                marker_table_soup = BeautifulSoup(marker_table.html, 'html.parser')
                marker_table_soup.find('tbody').unwrap()
                for th_tag in marker_table_soup.find_all('th'):
                    th_tag.name = 'td'
                marker_table_html = str(marker_table_soup)
                marker_table_html = marker_table_html.replace("\n", " ")  # Fintabnet uses spaces instead of newlines

                results.append({
                    "marker_table": marker_table_html,
                    "gt_table": gt_table_html
                })
        except PdfiumError:
            print('Broken PDF, Skipping...')
            continue

    print(f"Total time: {time.time() - start}.")
    print(f"Could not align {total_unaligned} tables from fintabnet.")

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(
            tqdm(
                executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
            )
        )

    if results:
        avg_score = sum(r["score"] for r in results) / len(results)
        headers = ["Avg score", "Total tables"]
        data = [f"{avg_score:.3f}", len(results)]
        table = tabulate([data], headers=headers, tablefmt="github")
        print(table)
        print("Avg score computed by comparing marker predicted HTML with original HTML")
    else:
        # Guard: previously this divided by len(results) and crashed with
        # ZeroDivisionError when no tables could be aligned.
        print("No aligned tables to score.")

    with open(out_file, "w+") as f:
        json.dump(results, f, indent=2)

if __name__ == '__main__':
    main()
chunk_convert.py CHANGED
@@ -1,22 +1,4 @@
1
- import argparse
2
- import subprocess
3
- import pkg_resources
4
-
5
-
6
- def main():
7
- parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
8
- parser.add_argument("in_folder", help="Input folder with pdfs.")
9
- parser.add_argument("out_folder", help="Output folder")
10
- args = parser.parse_args()
11
-
12
- script_path = pkg_resources.resource_filename(__name__, 'chunk_convert.sh')
13
-
14
- # Construct the command
15
- cmd = f"{script_path} {args.in_folder} {args.out_folder}"
16
-
17
- # Execute the shell script
18
- subprocess.run(cmd, shell=True, check=True)
19
-
20
 
21
  if __name__ == "__main__":
22
- main()
 
1
+ from marker.scripts import chunk_convert_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ chunk_convert_cli()
convert.py CHANGED
@@ -1,117 +1,4 @@
1
- import os
2
-
3
- os.environ["GRPC_VERBOSITY"] = "ERROR"
4
- os.environ["GLOG_minloglevel"] = "2"
5
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
- os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
7
-
8
- import math
9
- import traceback
10
-
11
- import click
12
- import torch.multiprocessing as mp
13
- from tqdm import tqdm
14
-
15
- from marker.config.parser import ConfigParser
16
- from marker.config.printer import CustomClickPrinter
17
- from marker.converters.pdf import PdfConverter
18
- from marker.logger import configure_logging
19
- from marker.models import create_model_dict
20
- from marker.output import output_exists, save_output
21
- from marker.settings import settings
22
-
23
- configure_logging()
24
-
25
-
26
- def worker_init(model_dict):
27
- if model_dict is None:
28
- model_dict = create_model_dict()
29
-
30
- global model_refs
31
- model_refs = model_dict
32
-
33
-
34
- def worker_exit():
35
- global model_refs
36
- del model_refs
37
-
38
-
39
- def process_single_pdf(args):
40
- fpath, cli_options = args
41
- config_parser = ConfigParser(cli_options)
42
-
43
- out_folder = config_parser.get_output_folder(fpath)
44
- base_name = config_parser.get_base_filename(fpath)
45
- if cli_options.get('skip_existing') and output_exists(out_folder, base_name):
46
- return
47
-
48
- try:
49
- converter = PdfConverter(
50
- config=config_parser.generate_config_dict(),
51
- artifact_dict=model_refs,
52
- processor_list=config_parser.get_processors(),
53
- renderer=config_parser.get_renderer()
54
- )
55
- rendered = converter(fpath)
56
- out_folder = config_parser.get_output_folder(fpath)
57
- save_output(rendered, out_folder, base_name)
58
- except Exception as e:
59
- print(f"Error converting {fpath}: {e}")
60
- print(traceback.format_exc())
61
-
62
-
63
- @click.command(cls=CustomClickPrinter)
64
- @click.argument("in_folder", type=str)
65
- @ConfigParser.common_options
66
- @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
67
- @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
68
- @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
69
- @click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
70
- @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
71
- def main(in_folder: str, **kwargs):
72
- in_folder = os.path.abspath(in_folder)
73
- files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
74
- files = [f for f in files if os.path.isfile(f)]
75
-
76
- # Handle chunks if we're processing in parallel
77
- # Ensure we get all files into a chunk
78
- chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
79
- start_idx = kwargs["chunk_idx"] * chunk_size
80
- end_idx = start_idx + chunk_size
81
- files_to_convert = files[start_idx:end_idx]
82
-
83
- # Limit files converted if needed
84
- if kwargs["max_files"]:
85
- files_to_convert = files_to_convert[:kwargs["max_files"]]
86
-
87
- # Disable nested multiprocessing
88
- kwargs["disable_multiprocessing"] = True
89
-
90
- total_processes = min(len(files_to_convert), kwargs["workers"])
91
-
92
- try:
93
- mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
94
- except RuntimeError:
95
- raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
96
-
97
- if settings.TORCH_DEVICE == "mps" or settings.TORCH_DEVICE_MODEL == "mps":
98
- model_dict = None
99
- else:
100
- model_dict = create_model_dict()
101
- for k, v in model_dict.items():
102
- v.share_memory()
103
-
104
- print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
105
- task_args = [(f, kwargs) for f in files_to_convert]
106
-
107
- with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,)) as pool:
108
- list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
109
-
110
- pool._worker_handler.terminate = worker_exit
111
-
112
- # Delete all CUDA tensors
113
- del model_dict
114
-
115
 
116
  if __name__ == "__main__":
117
- main()
 
1
+ from marker.scripts import convert_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ convert_cli()
convert_single.py CHANGED
@@ -1,43 +1,4 @@
1
- import os
2
-
3
- os.environ["GRPC_VERBOSITY"] = "ERROR"
4
- os.environ["GLOG_minloglevel"] = "2"
5
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
-
7
- import time
8
- import click
9
-
10
- from marker.config.parser import ConfigParser
11
- from marker.config.printer import CustomClickPrinter
12
- from marker.converters.pdf import PdfConverter
13
- from marker.logger import configure_logging
14
- from marker.models import create_model_dict
15
- from marker.output import save_output
16
-
17
- configure_logging()
18
-
19
-
20
- @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
21
- @click.argument("fpath", type=str)
22
- @ConfigParser.common_options
23
- def main(fpath: str, **kwargs):
24
- models = create_model_dict()
25
- start = time.time()
26
- config_parser = ConfigParser(kwargs)
27
-
28
- converter = PdfConverter(
29
- config=config_parser.generate_config_dict(),
30
- artifact_dict=models,
31
- processor_list=config_parser.get_processors(),
32
- renderer=config_parser.get_renderer()
33
- )
34
- rendered = converter(fpath)
35
- out_folder = config_parser.get_output_folder(fpath)
36
- save_output(rendered, out_folder, config_parser.get_base_filename(fpath))
37
-
38
- print(f"Saved markdown to {out_folder}")
39
- print(f"Total time: {time.time() - start}")
40
-
41
 
42
  if __name__ == "__main__":
43
- main()
 
1
+ from marker.scripts import convert_single_cli
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  if __name__ == "__main__":
4
+ convert_single_cli()
marker/builders/document.py CHANGED
@@ -22,11 +22,16 @@ class DocumentBuilder(BaseBuilder):
22
  int,
23
  "DPI setting for high-resolution page images used for OCR.",
24
  ] = 192
 
 
 
 
25
 
26
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
27
  document = self.build_document(provider)
28
  layout_builder(document, provider)
29
- ocr_builder(document, provider)
 
30
  return document
31
 
32
  def build_document(self, provider: PdfProvider):
 
22
  int,
23
  "DPI setting for high-resolution page images used for OCR.",
24
  ] = 192
25
+ disable_ocr: Annotated[
26
+ bool,
27
+ "Disable OCR processing.",
28
+ ] = False
29
 
30
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
31
  document = self.build_document(provider)
32
  layout_builder(document, provider)
33
+ if not self.disable_ocr:
34
+ ocr_builder(document, provider)
35
  return document
36
 
37
  def build_document(self, provider: PdfProvider):
marker/builders/layout.py CHANGED
@@ -1,11 +1,10 @@
1
  from typing import Annotated, List, Optional, Tuple
2
 
3
  import numpy as np
4
- from surya.layout import batch_layout_detection
5
- from surya.model.layout.encoderdecoder import SuryaLayoutModel
6
- from surya.model.ocr_error.model import DistilBertForSequenceClassification
7
- from surya.ocr_error import batch_ocr_error_detection
8
- from surya.schema import LayoutResult, OCRErrorDetectionResult
9
 
10
  from marker.builders import BaseBuilder
11
  from marker.providers import ProviderOutput, ProviderPageLines
@@ -51,15 +50,23 @@ class LayoutBuilder(BaseBuilder):
51
  Tuple[BlockTypes],
52
  "A list of block types to exclude from the layout coverage check.",
53
  ] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
 
 
 
 
54
 
55
- def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
56
  self.layout_model = layout_model
57
  self.ocr_error_model = ocr_error_model
58
 
59
  super().__init__(config)
60
 
61
  def __call__(self, document: Document, provider: PdfProvider):
62
- layout_results = self.surya_layout(document.pages)
 
 
 
 
63
  self.add_blocks_to_pages(document.pages, layout_results)
64
  self.merge_blocks(document.pages, provider.page_lines)
65
 
@@ -70,12 +77,29 @@ class LayoutBuilder(BaseBuilder):
70
  return 6
71
  return 6
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
74
- processor = self.layout_model.processor
75
- layout_results = batch_layout_detection(
76
- [p.lowres_image for p in pages],
77
- self.layout_model,
78
- processor,
79
  batch_size=int(self.get_batch_size())
80
  )
81
  return layout_results
@@ -97,10 +121,8 @@ class LayoutBuilder(BaseBuilder):
97
 
98
  page_texts.append(page_text)
99
 
100
- ocr_error_detection_results = batch_ocr_error_detection(
101
  page_texts,
102
- self.ocr_error_model,
103
- self.ocr_error_model.tokenizer,
104
  batch_size=int(self.get_batch_size()) # TODO Better Multiplier
105
  )
106
  return ocr_error_detection_results
 
1
  from typing import Annotated, List, Optional, Tuple
2
 
3
  import numpy as np
4
+ from surya.layout import LayoutPredictor
5
+ from surya.layout.schema import LayoutResult, LayoutBox
6
+ from surya.ocr_error import OCRErrorPredictor
7
+ from surya.ocr_error.schema import OCRErrorDetectionResult
 
8
 
9
  from marker.builders import BaseBuilder
10
  from marker.providers import ProviderOutput, ProviderPageLines
 
50
  Tuple[BlockTypes],
51
  "A list of block types to exclude from the layout coverage check.",
52
  ] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
53
+ force_layout_block: Annotated[
54
+ str,
55
+ "Skip layout and force every page to be treated as a specific block type.",
56
+ ] = None
57
 
58
+ def __init__(self, layout_model: LayoutPredictor, ocr_error_model: OCRErrorPredictor, config=None):
59
  self.layout_model = layout_model
60
  self.ocr_error_model = ocr_error_model
61
 
62
  super().__init__(config)
63
 
64
  def __call__(self, document: Document, provider: PdfProvider):
65
+ if self.force_layout_block is not None:
66
+ # Assign the full content of every page to a single layout type
67
+ layout_results = self.forced_layout(document.pages)
68
+ else:
69
+ layout_results = self.surya_layout(document.pages)
70
  self.add_blocks_to_pages(document.pages, layout_results)
71
  self.merge_blocks(document.pages, provider.page_lines)
72
 
 
77
  return 6
78
  return 6
79
 
80
+ def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
81
+ layout_results = []
82
+ for page in pages:
83
+ layout_results.append(
84
+ LayoutResult(
85
+ image_bbox=page.polygon.bbox,
86
+ bboxes=[
87
+ LayoutBox(
88
+ label=self.force_layout_block,
89
+ position=0,
90
+ top_k={self.force_layout_block: 1},
91
+ polygon=page.polygon.polygon,
92
+ ),
93
+ ],
94
+ sliced=False
95
+ )
96
+ )
97
+ return layout_results
98
+
99
+
100
  def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
101
+ layout_results = self.layout_model(
102
+ [p.get_image(highres=False) for p in pages],
 
 
 
103
  batch_size=int(self.get_batch_size())
104
  )
105
  return layout_results
 
121
 
122
  page_texts.append(page_text)
123
 
124
+ ocr_error_detection_results = self.ocr_error_model(
125
  page_texts,
 
 
126
  batch_size=int(self.get_batch_size()) # TODO Better Multiplier
127
  )
128
  return ocr_error_detection_results
marker/builders/llm_layout.py CHANGED
@@ -1,10 +1,9 @@
1
- import json
2
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
- from typing import Annotated, Optional
4
 
5
  from google.ai.generativelanguage_v1beta.types import content
6
- from surya.model.layout.encoderdecoder import SuryaLayoutModel
7
- from surya.model.ocr_error.model import DistilBertForSequenceClassification
8
  from tqdm import tqdm
9
 
10
  from marker.builders.layout import LayoutBuilder
@@ -24,16 +23,16 @@ class LLMLayoutBuilder(LayoutBuilder):
24
  """
25
 
26
  google_api_key: Annotated[
27
- Optional[str],
28
  "The Google API key to use for the Gemini model.",
29
  ] = settings.GOOGLE_API_KEY
30
  confidence_threshold: Annotated[
31
  float,
32
- "The confidence threshold to use for relabeling.",
33
- ] = 0.75
34
  picture_height_threshold: Annotated[
35
  float,
36
- "The height threshold for pictures that may actually be complex regions.",
37
  ] = 0.8
38
  model_name: Annotated[
39
  str,
@@ -55,43 +54,47 @@ class LLMLayoutBuilder(LayoutBuilder):
55
  str,
56
  "The prompt to use for relabelling blocks.",
57
  "Default is a string containing the Gemini relabelling prompt."
58
- ] = """You are a layout expert specializing in document analysis.
59
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
60
- You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
61
  Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
62
  Do not invent any new labels.
63
- Carefully examine the image and consider the provided predictions.
64
- Choose the label you believe is the most accurate representation of the layout block.
 
 
 
 
 
 
 
65
 
66
- Here are the top k predictions from the model followed by the image:
67
 
 
68
  """
69
  complex_relabeling_prompt: Annotated[
70
  str,
71
  "The prompt to use for complex relabelling blocks.",
72
  "Default is a string containing the complex relabelling prompt."
73
- ] = """You are a layout expert specializing in document analysis.
74
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
75
- You will be provided with an image of a layout block and some potential labels.
76
  Your job is to analyze the image and choose the single most appropriate label from the provided labels.
77
  Do not invent any new labels.
78
- Carefully examine the image and consider the provided predictions.
79
- Choose the label you believe is the most accurate representation of the layout block.
 
 
80
 
81
  Potential labels:
82
 
83
- - Picture
84
- - Table
85
- - Form
86
- - Figure - A graph or diagram with text.
87
- - ComplexRegion - a complex region containing multiple text and other elements.
88
 
89
  Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
90
-
91
- Here is the image of the layout block:
92
  """
93
 
94
- def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
95
  super().__init__(layout_model, ocr_error_model, config)
96
 
97
  self.model = GoogleModel(self.google_api_key, self.model_name)
@@ -114,10 +117,10 @@ Here is the image of the layout block:
114
  confidence = block.top_k.get(block.block_type)
115
  # Case when the block is detected as a different type with low confidence
116
  if confidence < self.confidence_threshold:
117
- futures.append(executor.submit(self.process_block_topk_relabeling, page, block))
118
  # Case when the block is detected as a picture or figure, but is actually complex
119
  elif block.block_type in (BlockTypes.Picture, BlockTypes.Figure, BlockTypes.SectionHeader) and block.polygon.height > page.polygon.height * self.picture_height_threshold:
120
- futures.append(executor.submit(self.process_block_complex_relabeling, page, block))
121
 
122
  for future in as_completed(futures):
123
  future.result() # Raise exceptions if any occurred
@@ -125,23 +128,40 @@ Here is the image of the layout block:
125
 
126
  pbar.close()
127
 
128
- def process_block_topk_relabeling(self, page: PageGroup, block: Block):
129
- topk = {str(k): round(v, 3) for k, v in block.top_k.items()}
 
 
 
 
 
 
 
 
130
 
131
- prompt = self.topk_relabelling_prompt + '```json' + json.dumps(topk) + '```\n'
132
- return self.process_block_relabeling(page, block, prompt)
133
 
134
- def process_block_complex_relabeling(self, page: PageGroup, block: Block):
135
- complex_prompt = self.complex_relabeling_prompt
136
- return self.process_block_relabeling(page, block, complex_prompt)
137
 
138
- def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
139
- image = self.extract_image(page, block)
 
 
 
 
 
 
 
 
 
140
  response_schema = content.Schema(
141
  type=content.Type.OBJECT,
142
  enum=[],
143
- required=["label"],
144
  properties={
 
 
 
145
  "label": content.Schema(
146
  type=content.Type.STRING,
147
  ),
@@ -162,10 +182,5 @@ Here is the image of the layout block:
162
  )
163
  page.replace_block(block, generated_block)
164
 
165
- def extract_image(self, page: PageGroup, image_block: Block, expand: float = 0.01):
166
- page_img = page.lowres_image
167
- image_box = image_block.polygon\
168
- .rescale(page.polygon.size, page_img.size)\
169
- .expand(expand, expand)
170
- cropped = page_img.crop(image_box.bbox)
171
- return cropped
 
 
1
  from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ from typing import Annotated
3
 
4
  from google.ai.generativelanguage_v1beta.types import content
5
+ from surya.layout import LayoutPredictor
6
+ from surya.ocr_error import OCRErrorPredictor
7
  from tqdm import tqdm
8
 
9
  from marker.builders.layout import LayoutBuilder
 
23
  """
24
 
25
  google_api_key: Annotated[
26
+ str,
27
  "The Google API key to use for the Gemini model.",
28
  ] = settings.GOOGLE_API_KEY
29
  confidence_threshold: Annotated[
30
  float,
31
+ "The confidence threshold to use for relabeling (anything below is relabeled).",
32
+ ] = 0.7
33
  picture_height_threshold: Annotated[
34
  float,
35
+ "The height threshold for pictures that may actually be complex regions. (anything above this ratio against the page is relabeled)",
36
  ] = 0.8
37
  model_name: Annotated[
38
  str,
 
54
  str,
55
  "The prompt to use for relabelling blocks.",
56
  "Default is a string containing the Gemini relabelling prompt."
57
+ ] = """You're a layout expert specializing in document analysis.
58
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
59
+ You will be provided with an image of a layout block and the top k predictions from the current model, along with the per-label confidence scores.
60
  Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
61
  Do not invent any new labels.
62
+ Carefully examine the image and consider the provided predictions. Take the model confidence scores into account. The confidence is reported on a 0-1 scale, with 1 being 100% confident. If the existing label is the most appropriate, you should not change it.
63
+ **Instructions**
64
+ 1. Analyze the image and consider the provided top k predictions.
65
+ 2. Write a short description of the image, and which of the potential labels you believe is the most accurate representation of the layout block.
66
+ 3. Choose the single most appropriate label from the provided top k predictions.
67
+
68
+ Here are descriptions of the layout blocks you can choose from:
69
+
70
+ {potential_labels}
71
 
72
+ Here are the top k predictions from the model:
73
 
74
+ {top_k}
75
  """
76
  complex_relabeling_prompt: Annotated[
77
  str,
78
  "The prompt to use for complex relabelling blocks.",
79
  "Default is a string containing the complex relabelling prompt."
80
+ ] = """You're a layout expert specializing in document analysis.
81
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
82
+ You will be provided with an image of a layout block and some potential labels that might be appropriate.
83
  Your job is to analyze the image and choose the single most appropriate label from the provided labels.
84
  Do not invent any new labels.
85
+ **Instructions**
86
+ 1. Analyze the image and consider the potential labels.
87
+ 2. Write a short description of the image, and which of the potential labels you believe is the most accurate representation of the layout block.
88
+ 3. Choose the single most appropriate label from the provided labels.
89
 
90
  Potential labels:
91
 
92
+ {potential_labels}
 
 
 
 
93
 
94
  Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form`.
 
 
95
  """
96
 
97
+ def __init__(self, layout_model: LayoutPredictor, ocr_error_model: OCRErrorPredictor, config=None):
98
  super().__init__(layout_model, ocr_error_model, config)
99
 
100
  self.model = GoogleModel(self.google_api_key, self.model_name)
 
117
  confidence = block.top_k.get(block.block_type)
118
  # Case when the block is detected as a different type with low confidence
119
  if confidence < self.confidence_threshold:
120
+ futures.append(executor.submit(self.process_block_topk_relabeling, document, page, block))
121
  # Case when the block is detected as a picture or figure, but is actually complex
122
  elif block.block_type in (BlockTypes.Picture, BlockTypes.Figure, BlockTypes.SectionHeader) and block.polygon.height > page.polygon.height * self.picture_height_threshold:
123
+ futures.append(executor.submit(self.process_block_complex_relabeling, document, page, block))
124
 
125
  for future in as_completed(futures):
126
  future.result() # Raise exceptions if any occurred
 
128
 
129
  pbar.close()
130
 
131
+ def process_block_topk_relabeling(self, document: Document, page: PageGroup, block: Block):
132
+ topk_types = list(block.top_k.keys())
133
+ potential_labels = ""
134
+ for block_type in topk_types:
135
+ label_cls = get_block_class(block_type)
136
+ potential_labels += f"- `{block_type}` - {label_cls.model_fields['block_description'].default}\n"
137
+
138
+ topk = ""
139
+ for k,v in block.top_k.items():
140
+ topk += f"- `{k}` - Confidence {round(v, 3)}\n"
141
 
142
+ prompt = self.topk_relabelling_prompt.replace("{potential_labels}", potential_labels).replace("{top_k}", topk)
 
143
 
144
+ return self.process_block_relabeling(document, page, block, prompt)
 
 
145
 
146
+ def process_block_complex_relabeling(self, document: Document, page: PageGroup, block: Block):
147
+ potential_labels = ""
148
+ for block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.ComplexRegion, BlockTypes.Table, BlockTypes.Form]:
149
+ label_cls = get_block_class(block_type)
150
+ potential_labels += f"- `{block_type}` - {label_cls.model_fields['block_description'].default}\n"
151
+
152
+ complex_prompt = self.complex_relabeling_prompt.replace("{potential_labels}", potential_labels)
153
+ return self.process_block_relabeling(document, page, block, complex_prompt)
154
+
155
+ def process_block_relabeling(self, document: Document, page: PageGroup, block: Block, prompt: str):
156
+ image = self.extract_image(document, block)
157
  response_schema = content.Schema(
158
  type=content.Type.OBJECT,
159
  enum=[],
160
+ required=["image_description", "label"],
161
  properties={
162
+ "image_description": content.Schema(
163
+ type=content.Type.STRING,
164
+ ),
165
  "label": content.Schema(
166
  type=content.Type.STRING,
167
  ),
 
182
  )
183
  page.replace_block(block, generated_block)
184
 
185
+ def extract_image(self, document: Document, image_block: Block, expand: float = 0.01):
186
+ return image_block.get_image(document, highres=False, expansion=(expand, expand))
 
 
 
 
 
marker/builders/ocr.py CHANGED
@@ -1,9 +1,8 @@
1
  from typing import Annotated, List, Optional
2
 
3
  from ftfy import fix_text
4
- from surya.model.detection.model import EfficientViTForSemanticSegmentation
5
- from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
6
- from surya.ocr import run_ocr
7
 
8
  from marker.builders import BaseBuilder
9
  from marker.providers import ProviderOutput, ProviderPageLines
@@ -37,7 +36,7 @@ class OcrBuilder(BaseBuilder):
37
  "Default is None."
38
  ] = None
39
 
40
- def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
41
  super().__init__(config)
42
 
43
  self.detection_model = detection_model
@@ -65,16 +64,13 @@ class OcrBuilder(BaseBuilder):
65
 
66
  def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderPageLines:
67
  page_list = [page for page in document.pages if page.text_extraction_method == "surya"]
68
- recognition_results = run_ocr(
69
- images=[page.lowres_image for page in page_list],
70
  langs=[self.languages] * len(page_list),
71
- det_model=self.detection_model,
72
- det_processor=self.detection_model.processor,
73
- rec_model=self.recognition_model,
74
- rec_processor=self.recognition_model.processor,
75
  detection_batch_size=int(self.get_detection_batch_size()),
76
  recognition_batch_size=int(self.get_recognition_batch_size()),
77
- highres_images=[page.highres_image for page in page_list]
78
  )
79
 
80
  page_lines = {}
 
1
  from typing import Annotated, List, Optional
2
 
3
  from ftfy import fix_text
4
+ from surya.detection import DetectionPredictor
5
+ from surya.recognition import RecognitionPredictor
 
6
 
7
  from marker.builders import BaseBuilder
8
  from marker.providers import ProviderOutput, ProviderPageLines
 
36
  "Default is None."
37
  ] = None
38
 
39
+ def __init__(self, detection_model: DetectionPredictor, recognition_model: RecognitionPredictor, config=None):
40
  super().__init__(config)
41
 
42
  self.detection_model = detection_model
 
64
 
65
  def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderPageLines:
66
  page_list = [page for page in document.pages if page.text_extraction_method == "surya"]
67
+ recognition_results = self.recognition_model(
68
+ images=[page.get_image(highres=False) for page in page_list],
69
  langs=[self.languages] * len(page_list),
70
+ det_predictor=self.detection_model,
 
 
 
71
  detection_batch_size=int(self.get_detection_batch_size()),
72
  recognition_batch_size=int(self.get_recognition_batch_size()),
73
+ highres_images=[page.get_image(highres=True) for page in page_list]
74
  )
75
 
76
  page_lines = {}
marker/config/parser.py CHANGED
@@ -5,11 +5,13 @@ from typing import Dict
5
  import click
6
 
7
  from marker.config.crawler import crawler
 
8
  from marker.renderers.html import HTMLRenderer
9
  from marker.renderers.json import JSONRenderer
10
  from marker.renderers.markdown import MarkdownRenderer
11
  from marker.settings import settings
12
  from marker.util import classes_to_strings, parse_range_str, strings_to_classes
 
13
 
14
 
15
  class ConfigParser:
@@ -39,6 +41,10 @@ class ConfigParser:
39
  # we put common options here
40
  fn = click.option("--google_api_key", type=str, default=None, help="Google API key for using LLMs.")(fn)
41
  fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
 
 
 
 
42
  return fn
43
 
44
  def generate_config_dict(self) -> Dict[str, any]:
@@ -95,6 +101,17 @@ class ConfigParser:
95
 
96
  return processors
97
 
 
 
 
 
 
 
 
 
 
 
 
98
  def get_output_folder(self, filepath: str):
99
  output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
100
  fname_base = os.path.splitext(os.path.basename(filepath))[0]
 
5
  import click
6
 
7
  from marker.config.crawler import crawler
8
+ from marker.converters.pdf import PdfConverter
9
  from marker.renderers.html import HTMLRenderer
10
  from marker.renderers.json import JSONRenderer
11
  from marker.renderers.markdown import MarkdownRenderer
12
  from marker.settings import settings
13
  from marker.util import classes_to_strings, parse_range_str, strings_to_classes
14
+ from marker.schema import BlockTypes
15
 
16
 
17
  class ConfigParser:
 
41
  # we put common options here
42
  fn = click.option("--google_api_key", type=str, default=None, help="Google API key for using LLMs.")(fn)
43
  fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
44
+ fn = click.option("--converter_cls", type=str, default=None, help="Converter class to use. Defaults to PDF converter.")(fn)
45
+
46
+ # enum options
47
+ fn = click.option("--force_layout_block", type=click.Choice(choices=[t.name for t in BlockTypes]), default=None,)(fn)
48
  return fn
49
 
50
  def generate_config_dict(self) -> Dict[str, any]:
 
101
 
102
  return processors
103
 
104
+ def get_converter_cls(self):
105
+ converter_cls = self.cli_options.get("converter_cls", None)
106
+ if converter_cls is not None:
107
+ try:
108
+ return strings_to_classes([converter_cls])[0]
109
+ except Exception as e:
110
+ print(f"Error loading converter: {converter_cls} with error: {e}")
111
+ raise
112
+
113
+ return PdfConverter
114
+
115
  def get_output_folder(self, filepath: str):
116
  output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
117
  fname_base = os.path.splitext(os.path.basename(filepath))[0]
marker/config/printer.py CHANGED
@@ -6,19 +6,47 @@ from marker.config.crawler import crawler
6
 
7
 
8
  class CustomClickPrinter(click.Command):
9
- def get_help(self, ctx):
10
- additional_help = (
11
- "\n\nTip: Use 'config --help' to display all the attributes of the Builders, Processors, and Converters in Marker."
12
- )
13
- help_text = super().get_help(ctx)
14
- help_text = help_text + additional_help
15
- click.echo(help_text)
16
-
17
  def parse_args(self, ctx, args):
 
18
  display_help = 'config' in args and '--help' in args
19
  if display_help:
20
- click.echo("Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
 
 
 
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  for base_type, base_type_dict in crawler.class_config_map.items():
23
  if display_help:
24
  click.echo(f"{base_type}s:")
@@ -32,16 +60,14 @@ class CustomClickPrinter(click.Command):
32
  if display_help:
33
  click.echo(" " * 8 + f"{attr} ({formatted_type}):")
34
  click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
35
- if attr_type in [str, int, float, bool, Optional[int], Optional[float], Optional[str]]:
 
36
  is_flag = attr_type in [bool, Optional[bool]] and not default
37
- if crawler.attr_counts.get(attr) > 1:
38
- options = ["--" + class_name_attr]
39
- else:
40
- options = ["--" + attr, "--" + class_name_attr]
41
- options.append(class_name_attr)
42
  ctx.command.params.append(
43
  click.Option(
44
- options,
45
  type=attr_type,
46
  help=" ".join(metadata),
47
  is_flag=is_flag,
 
6
 
7
 
8
  class CustomClickPrinter(click.Command):
 
 
 
 
 
 
 
 
9
  def parse_args(self, ctx, args):
10
+
11
  display_help = 'config' in args and '--help' in args
12
  if display_help:
13
+ click.echo(
14
+ "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
15
+
16
+ # Keep track of shared attributes and their types
17
+ shared_attrs = {}
18
 
19
+ # First pass: identify shared attributes and verify compatibility
20
+ for base_type, base_type_dict in crawler.class_config_map.items():
21
+ for class_name, class_map in base_type_dict.items():
22
+ for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
23
+ if attr not in shared_attrs:
24
+ shared_attrs[attr] = {
25
+ 'classes': [],
26
+ 'type': attr_type,
27
+ 'is_flag': attr_type in [bool, Optional[bool]] and not default,
28
+ 'metadata': metadata,
29
+ 'default': default
30
+ }
31
+ shared_attrs[attr]['classes'].append(class_name)
32
+
33
+ # These are the types of attrs that can be set from the command line
34
+ attr_types = [str, int, float, bool, Optional[int], Optional[float], Optional[str]]
35
+
36
+ # Add shared attribute options first
37
+ for attr, info in shared_attrs.items():
38
+ if info['type'] in attr_types:
39
+ ctx.command.params.append(
40
+ click.Option(
41
+ ["--" + attr],
42
+ type=info['type'],
43
+ help=" ".join(info['metadata']) + f" (Applies to: {', '.join(info['classes'])})",
44
+ default=info['default'],
45
+ is_flag=info['is_flag'],
46
+ )
47
+ )
48
+
49
+ # Second pass: create class-specific options
50
  for base_type, base_type_dict in crawler.class_config_map.items():
51
  if display_help:
52
  click.echo(f"{base_type}s:")
 
60
  if display_help:
61
  click.echo(" " * 8 + f"{attr} ({formatted_type}):")
62
  click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
63
+
64
+ if attr_type in attr_types:
65
  is_flag = attr_type in [bool, Optional[bool]] and not default
66
+
67
+ # Only add class-specific options
 
 
 
68
  ctx.command.params.append(
69
  click.Option(
70
+ ["--" + class_name_attr, class_name_attr],
71
  type=attr_type,
72
  help=" ".join(metadata),
73
  is_flag=is_flag,
marker/converters/pdf.py CHANGED
@@ -1,12 +1,14 @@
1
  import os
2
-
3
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
4
 
5
  import inspect
6
  from collections import defaultdict
7
  from functools import cache
8
- from typing import Annotated, Any, Dict, List, Optional, Type
9
 
 
 
 
10
  from marker.builders.document import DocumentBuilder
11
  from marker.builders.layout import LayoutBuilder
12
  from marker.builders.llm_layout import LLMLayoutBuilder
@@ -32,12 +34,13 @@ from marker.processors.reference import ReferenceProcessor
32
  from marker.processors.sectionheader import SectionHeaderProcessor
33
  from marker.processors.table import TableProcessor
34
  from marker.processors.text import TextProcessor
35
- from marker.providers.pdf import PdfProvider
36
  from marker.renderers.markdown import MarkdownRenderer
37
  from marker.schema import BlockTypes
38
  from marker.schema.blocks import Block
39
  from marker.schema.registry import register_block_class
40
  from marker.util import strings_to_classes
 
41
 
42
 
43
  class PdfConverter(BaseConverter):
@@ -55,6 +58,30 @@ class PdfConverter(BaseConverter):
55
  bool,
56
  "Enable higher quality processing with LLMs.",
57
  ] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, config=None):
60
  super().__init__(config)
@@ -65,27 +92,7 @@ class PdfConverter(BaseConverter):
65
  if processor_list:
66
  processor_list = strings_to_classes(processor_list)
67
  else:
68
- processor_list = [
69
- BlockquoteProcessor,
70
- CodeProcessor,
71
- DocumentTOCProcessor,
72
- EquationProcessor,
73
- FootnoteProcessor,
74
- IgnoreTextProcessor,
75
- LineNumbersProcessor,
76
- ListProcessor,
77
- PageHeaderProcessor,
78
- SectionHeaderProcessor,
79
- TableProcessor,
80
- LLMTableProcessor,
81
- LLMFormProcessor,
82
- TextProcessor,
83
- LLMTextProcessor,
84
- LLMComplexRegionProcessor,
85
- LLMImageDescriptionProcessor,
86
- ReferenceProcessor,
87
- DebugProcessor,
88
- ]
89
 
90
  if renderer:
91
  renderer = strings_to_classes([renderer])[0]
@@ -121,11 +128,11 @@ class PdfConverter(BaseConverter):
121
 
122
  @cache
123
  def build_document(self, filepath: str):
 
124
  layout_builder = self.resolve_dependencies(self.layout_builder_class)
125
  ocr_builder = self.resolve_dependencies(OcrBuilder)
126
-
127
- with PdfProvider(filepath, self.config) as pdf_provider:
128
- document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder)
129
  StructureBuilder(self.config)(document)
130
 
131
  for processor_cls in self.processor_list:
 
1
  import os
 
2
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
3
 
4
  import inspect
5
  from collections import defaultdict
6
  from functools import cache
7
+ from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
8
 
9
+ from marker.processors import BaseProcessor
10
+ from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
11
+ from marker.providers.registry import provider_from_filepath
12
  from marker.builders.document import DocumentBuilder
13
  from marker.builders.layout import LayoutBuilder
14
  from marker.builders.llm_layout import LLMLayoutBuilder
 
34
  from marker.processors.sectionheader import SectionHeaderProcessor
35
  from marker.processors.table import TableProcessor
36
  from marker.processors.text import TextProcessor
37
+ from marker.processors.llm.llm_equation import LLMEquationProcessor
38
  from marker.renderers.markdown import MarkdownRenderer
39
  from marker.schema import BlockTypes
40
  from marker.schema.blocks import Block
41
  from marker.schema.registry import register_block_class
42
  from marker.util import strings_to_classes
43
+ from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
44
 
45
 
46
  class PdfConverter(BaseConverter):
 
58
  bool,
59
  "Enable higher quality processing with LLMs.",
60
  ] = False
61
+ default_processors: Tuple[BaseProcessor, ...] = (
62
+ BlockquoteProcessor,
63
+ CodeProcessor,
64
+ DocumentTOCProcessor,
65
+ EquationProcessor,
66
+ FootnoteProcessor,
67
+ IgnoreTextProcessor,
68
+ LineNumbersProcessor,
69
+ ListProcessor,
70
+ PageHeaderProcessor,
71
+ SectionHeaderProcessor,
72
+ TableProcessor,
73
+ LLMTableProcessor,
74
+ LLMTableMergeProcessor,
75
+ LLMFormProcessor,
76
+ TextProcessor,
77
+ LLMTextProcessor,
78
+ LLMComplexRegionProcessor,
79
+ LLMImageDescriptionProcessor,
80
+ LLMEquationProcessor,
81
+ LLMHandwritingProcessor,
82
+ ReferenceProcessor,
83
+ DebugProcessor,
84
+ )
85
 
86
  def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, config=None):
87
  super().__init__(config)
 
92
  if processor_list:
93
  processor_list = strings_to_classes(processor_list)
94
  else:
95
+ processor_list = self.default_processors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  if renderer:
98
  renderer = strings_to_classes([renderer])[0]
 
128
 
129
  @cache
130
  def build_document(self, filepath: str):
131
+ provider_cls = provider_from_filepath(filepath)
132
  layout_builder = self.resolve_dependencies(self.layout_builder_class)
133
  ocr_builder = self.resolve_dependencies(OcrBuilder)
134
+ with provider_cls(filepath, self.config) as provider:
135
+ document = DocumentBuilder(self.config)(provider, layout_builder, ocr_builder)
 
136
  StructureBuilder(self.config)(document)
137
 
138
  for processor_cls in self.processor_list:
marker/converters/table.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import cache
2
+ from typing import Tuple, List
3
+
4
+ from marker.builders.document import DocumentBuilder
5
+ from marker.builders.ocr import OcrBuilder
6
+ from marker.converters.pdf import PdfConverter
7
+ from marker.processors import BaseProcessor
8
+ from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
9
+ from marker.processors.llm.llm_form import LLMFormProcessor
10
+ from marker.processors.llm.llm_table import LLMTableProcessor
11
+ from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
12
+ from marker.processors.table import TableProcessor
13
+ from marker.providers.registry import provider_from_filepath
14
+ from marker.schema import BlockTypes
15
+
16
+
17
+ class TableConverter(PdfConverter):
18
+ default_processors: Tuple[BaseProcessor, ...] = (
19
+ TableProcessor,
20
+ LLMTableProcessor,
21
+ LLMTableMergeProcessor,
22
+ LLMFormProcessor,
23
+ LLMComplexRegionProcessor,
24
+ )
25
+ converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)
26
+
27
+ @cache
28
+ def build_document(self, filepath: str):
29
+ provider_cls = provider_from_filepath(filepath)
30
+ layout_builder = self.resolve_dependencies(self.layout_builder_class)
31
+ ocr_builder = self.resolve_dependencies(OcrBuilder)
32
+ document_builder = DocumentBuilder(self.config)
33
+ document_builder.disable_ocr = True
34
+ with provider_cls(filepath, self.config) as provider:
35
+ document = document_builder(provider, layout_builder, ocr_builder)
36
+
37
+ for page in document.pages:
38
+ page.structure = [p for p in page.structure if p.block_type in self.converter_block_types]
39
+
40
+ for processor_cls in self.processor_list:
41
+ processor = self.resolve_dependencies(processor_cls)
42
+ processor(document)
43
+
44
+ return document
45
+
46
+ def __call__(self, filepath: str):
47
+ document = self.build_document(filepath)
48
+ renderer = self.resolve_dependencies(self.renderer)
49
+ return renderer(document)
marker/models.py CHANGED
@@ -1,86 +1,49 @@
1
  import os
2
 
3
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
4
-
5
- from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
6
- from surya.model.layout.model import load_model as load_layout_model
7
- from surya.model.layout.processor import load_processor as load_layout_processor
8
- from texify.model.model import load_model as load_texify_model
9
- from texify.model.processor import load_processor as load_texify_processor
10
  from marker.settings import settings
11
- from surya.model.recognition.model import load_model as load_recognition_model
12
- from surya.model.recognition.processor import load_processor as load_recognition_processor
13
- from surya.model.table_rec.model import load_model as load_table_model
14
- from surya.model.table_rec.processor import load_processor as load_table_processor
15
- from surya.model.ocr_error.model import load_model as load_ocr_error_model
16
- from surya.model.ocr_error.model import load_tokenizer as load_ocr_error_tokenizer
17
-
18
- from texify.model.model import GenerateVisionEncoderDecoderModel
19
- from surya.model.layout.encoderdecoder import SuryaLayoutModel
20
- from surya.model.detection.model import EfficientViTForSemanticSegmentation
21
- from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
22
- from surya.model.table_rec.encoderdecoder import TableRecEncoderDecoderModel
23
- from surya.model.ocr_error.model import DistilBertForSequenceClassification
24
-
25
 
26
- def setup_table_rec_model(device=None, dtype=None) -> TableRecEncoderDecoderModel:
27
- if device:
28
- table_model = load_table_model(device=device, dtype=dtype)
29
- else:
30
- table_model = load_table_model()
31
- table_model.processor = load_table_processor()
32
- return table_model
33
-
34
-
35
- def setup_recognition_model(device=None, dtype=None) -> OCREncoderDecoderModel:
36
- if device:
37
- rec_model = load_recognition_model(device=device, dtype=dtype)
38
- else:
39
- rec_model = load_recognition_model()
40
- rec_model.processor = load_recognition_processor()
41
- return rec_model
42
 
 
 
43
 
44
- def setup_detection_model(device=None, dtype=None) -> EfficientViTForSemanticSegmentation:
45
- if device:
46
- model = load_detection_model(device=device, dtype=dtype)
47
- else:
48
- model = load_detection_model()
49
- model.processor = load_detection_processor()
50
- return model
51
 
 
 
 
52
 
53
- def setup_texify_model(device=None, dtype=None) -> GenerateVisionEncoderDecoderModel:
54
- if device:
55
- texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
56
- else:
57
- texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
58
- texify_model.processor = load_texify_processor()
59
- return texify_model
60
 
 
 
 
 
61
 
62
- def setup_layout_model(device=None, dtype=None) -> SuryaLayoutModel:
63
- if device:
64
- model = load_layout_model(device=device, dtype=dtype)
65
- else:
66
- model = load_layout_model()
67
- model.processor = load_layout_processor()
68
- return model
69
 
70
- def setup_ocr_error_model(device=None, dtype=None) -> DistilBertForSequenceClassification:
71
- if device:
72
- model = load_ocr_error_model(device=device, dtype=dtype)
73
- else:
74
- model = load_ocr_error_model()
75
- model.tokenizer = load_ocr_error_tokenizer()
76
- return model
77
 
78
  def create_model_dict(device=None, dtype=None) -> dict:
79
  return {
80
- "layout_model": setup_layout_model(device, dtype),
81
- "texify_model": setup_texify_model(device, dtype),
82
- "recognition_model": setup_recognition_model(device, dtype),
83
- "table_rec_model": setup_table_rec_model(device, dtype),
84
- "detection_model": setup_detection_model(device, dtype),
85
- "ocr_error_model": setup_ocr_error_model(device,dtype)
86
  }
 
1
  import os
2
 
 
 
 
 
 
 
 
3
  from marker.settings import settings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ from typing import List
8
+ from PIL import Image
9
 
10
+ from surya.detection import DetectionPredictor
11
+ from surya.layout import LayoutPredictor
12
+ from surya.ocr_error import OCRErrorPredictor
13
+ from surya.recognition import RecognitionPredictor
14
+ from surya.table_rec import TableRecPredictor
 
 
15
 
16
+ from texify.model.model import load_model as load_texify_model
17
+ from texify.model.processor import load_processor as load_texify_processor
18
+ from texify.inference import batch_inference
19
 
20
+ class TexifyPredictor:
21
+ def __init__(self, device=None, dtype=None):
22
+ if not device:
23
+ device = settings.TORCH_DEVICE_MODEL
24
+ if not dtype:
25
+ dtype = settings.TEXIFY_DTYPE
 
26
 
27
+ self.model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
28
+ self.processor = load_texify_processor()
29
+ self.device = device
30
+ self.dtype = dtype
31
 
32
+ def __call__(self, batch_images: List[Image.Image], max_tokens: int):
33
+ return batch_inference(
34
+ batch_images,
35
+ self.model,
36
+ self.processor,
37
+ max_tokens=max_tokens
38
+ )
39
 
 
 
 
 
 
 
 
40
 
41
  def create_model_dict(device=None, dtype=None) -> dict:
42
  return {
43
+ "layout_model": LayoutPredictor(device=device, dtype=dtype),
44
+ "texify_model": TexifyPredictor(device=device, dtype=dtype),
45
+ "recognition_model": RecognitionPredictor(device=device, dtype=dtype),
46
+ "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
47
+ "detection_model": DetectionPredictor(device=device, dtype=dtype),
48
+ "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype)
49
  }
marker/processors/debug.py CHANGED
@@ -68,7 +68,7 @@ class DebugProcessor(BaseProcessor):
68
 
69
  def draw_pdf_debug_images(self, document: Document):
70
  for page in document.pages:
71
- png_image = page.highres_image.copy()
72
 
73
  line_bboxes = []
74
  span_bboxes = []
@@ -90,7 +90,7 @@ class DebugProcessor(BaseProcessor):
90
 
91
  def draw_layout_debug_images(self, document: Document, pdf_mode=False):
92
  for page in document.pages:
93
- img_size = page.highres_image.size
94
  png_image = Image.new("RGB", img_size, color="white")
95
 
96
  line_bboxes = []
 
68
 
69
  def draw_pdf_debug_images(self, document: Document):
70
  for page in document.pages:
71
+ png_image = page.get_image(highres=True).copy()
72
 
73
  line_bboxes = []
74
  span_bboxes = []
 
90
 
91
  def draw_layout_debug_images(self, document: Document, pdf_mode=False):
92
  for page in document.pages:
93
+ img_size = page.get_image(highres=True).size
94
  png_image = Image.new("RGB", img_size, color="white")
95
 
96
  line_bboxes = []
marker/processors/equation.py CHANGED
@@ -4,6 +4,7 @@ from texify.inference import batch_inference
4
  from texify.model.model import GenerateVisionEncoderDecoderModel
5
  from tqdm import tqdm
6
 
 
7
  from marker.processors import BaseProcessor
8
  from marker.schema import BlockTypes
9
  from marker.schema.document import Document
@@ -32,7 +33,7 @@ class EquationProcessor(BaseProcessor):
32
  "The number of tokens to buffer above max for the Texify model.",
33
  ] = 256
34
 
35
- def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
36
  super().__init__(config)
37
 
38
  self.texify_model = texify_model
@@ -42,8 +43,7 @@ class EquationProcessor(BaseProcessor):
42
 
43
  for page in document.pages:
44
  for block in page.contained_blocks(document, self.block_types):
45
- image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
46
- image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
47
  raw_text = block.raw_text(document)
48
  token_count = self.get_total_texify_tokens(raw_text)
49
 
@@ -92,10 +92,8 @@ class EquationProcessor(BaseProcessor):
92
 
93
  batch_images = [eq["image"] for eq in batch_equations]
94
 
95
- model_output = batch_inference(
96
  batch_images,
97
- self.texify_model,
98
- self.texify_model.processor,
99
  max_tokens=max_length
100
  )
101
 
 
4
  from texify.model.model import GenerateVisionEncoderDecoderModel
5
  from tqdm import tqdm
6
 
7
+ from marker.models import TexifyPredictor
8
  from marker.processors import BaseProcessor
9
  from marker.schema import BlockTypes
10
  from marker.schema.document import Document
 
33
  "The number of tokens to buffer above max for the Texify model.",
34
  ] = 256
35
 
36
+ def __init__(self, texify_model: TexifyPredictor, config=None):
37
  super().__init__(config)
38
 
39
  self.texify_model = texify_model
 
43
 
44
  for page in document.pages:
45
  for block in page.contained_blocks(document, self.block_types):
46
+ image = block.get_image(document, highres=False).convert("RGB")
 
47
  raw_text = block.raw_text(document)
48
  token_count = self.get_total_texify_tokens(raw_text)
49
 
 
92
 
93
  batch_images = [eq["image"] for eq in batch_equations]
94
 
95
+ model_output = self.texify_model(
96
  batch_images,
 
 
97
  max_tokens=max_length
98
  )
99
 
marker/processors/ignoretext.py CHANGED
@@ -17,8 +17,7 @@ class IgnoreTextProcessor(BaseProcessor):
17
  These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
18
  """
19
  block_types = (
20
- BlockTypes.Text, BlockTypes.PageHeader,
21
- BlockTypes.PageFooter, BlockTypes.SectionHeader,
22
  BlockTypes.TextInlineMath
23
  )
24
  common_element_threshold: Annotated[
@@ -47,7 +46,6 @@ class IgnoreTextProcessor(BaseProcessor):
47
  last_blocks = []
48
  for page in document.pages:
49
  initial_block = None
50
- block = None
51
  last_block = None
52
  for block in page.contained_blocks(document, self.block_types):
53
  if block.structure is not None:
 
17
  These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
18
  """
19
  block_types = (
20
+ BlockTypes.Text, BlockTypes.SectionHeader,
 
21
  BlockTypes.TextInlineMath
22
  )
23
  common_element_threshold: Annotated[
 
46
  last_blocks = []
47
  for page in document.pages:
48
  initial_block = None
 
49
  last_block = None
50
  for block in page.contained_blocks(document, self.block_types):
51
  if block.structure is not None:
marker/processors/llm/__init__.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from concurrent.futures import ThreadPoolExecutor, as_completed
2
  from typing import Annotated, Optional
3
 
@@ -16,7 +17,7 @@ class BaseLLMProcessor(BaseProcessor):
16
  A processor for using LLMs to convert blocks.
17
  """
18
  google_api_key: Annotated[
19
- Optional[str],
20
  "The Google API key to use for the Gemini model.",
21
  ] = settings.GOOGLE_API_KEY
22
  model_name: Annotated[
@@ -39,11 +40,6 @@ class BaseLLMProcessor(BaseProcessor):
39
  float,
40
  "The ratio to expand the image by when cropping.",
41
  ] = 0.01
42
- gemini_rewriting_prompt: Annotated[
43
- str,
44
- "The prompt to use for rewriting text.",
45
- "Default is a string containing the Gemini rewriting prompt."
46
- ] = ''
47
  use_llm: Annotated[
48
  bool,
49
  "Whether to use the LLM model.",
@@ -84,10 +80,5 @@ class BaseLLMProcessor(BaseProcessor):
84
 
85
  pbar.close()
86
 
87
- def extract_image(self, page: PageGroup, image_block: Block):
88
- page_img = page.lowres_image
89
- image_box = image_block.polygon\
90
- .rescale(page.polygon.size, page_img.size)\
91
- .expand(self.image_expansion_ratio, self.image_expansion_ratio)
92
- cropped = page_img.crop(image_box.bbox)
93
- return cropped
 
1
+ import traceback
2
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
  from typing import Annotated, Optional
4
 
 
17
  A processor for using LLMs to convert blocks.
18
  """
19
  google_api_key: Annotated[
20
+ str,
21
  "The Google API key to use for the Gemini model.",
22
  ] = settings.GOOGLE_API_KEY
23
  model_name: Annotated[
 
40
  float,
41
  "The ratio to expand the image by when cropping.",
42
  ] = 0.01
 
 
 
 
 
43
  use_llm: Annotated[
44
  bool,
45
  "Whether to use the LLM model.",
 
80
 
81
  pbar.close()
82
 
83
+ def extract_image(self, document: Document, image_block: Block):
84
+ return image_block.get_image(document, highres=False, expansion=(self.image_expansion_ratio, self.image_expansion_ratio))
 
 
 
 
 
marker/processors/llm/llm_complex.py CHANGED
@@ -12,9 +12,9 @@ from marker.schema.groups.page import PageGroup
12
 
13
  class LLMComplexRegionProcessor(BaseLLMProcessor):
14
  block_types = (BlockTypes.ComplexRegion,)
15
- gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
16
  You will receive an image of a text block and the text that can be extracted from the image.
17
- Your task is to correct any errors in the text, and format it properly.
18
 
19
  Formatting should be in markdown, with the following rules:
20
  - * for italics, ** for bold, and ` for inline code.
@@ -29,27 +29,32 @@ Formatting should be in markdown, with the following rules:
29
 
30
  **Instructions:**
31
  1. Carefully examine the provided block image.
32
- 2. Analyze the text representation
33
- 3. If the text representation is largely correct, then write "No corrections needed."
34
- 4. If the text representation contains errors, generate the corrected markdown representation.
35
- 5. Output only either the corrected markdown representation or "No corrections needed."
36
  **Example:**
37
  Input:
38
  ```text
39
- This is an example text block.
40
  ```
41
  Output:
42
  ```markdown
43
- No corrections needed.
 
 
 
 
 
44
  ```
45
  **Input:**
 
 
 
46
  """
47
 
48
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
49
  text = block.raw_text(document)
50
-
51
- prompt = self.gemini_rewriting_prompt + '```text\n`' + text + '`\n```\n'
52
- image = self.extract_image(page, block)
53
  response_schema = content.Schema(
54
  type=content.Type.OBJECT,
55
  enum=[],
@@ -79,4 +84,5 @@ No corrections needed.
79
  return
80
 
81
  # Convert LLM markdown to html
 
82
  block.html = markdown2.markdown(corrected_markdown)
 
12
 
13
  class LLMComplexRegionProcessor(BaseLLMProcessor):
14
  block_types = (BlockTypes.ComplexRegion,)
15
+ complex_region_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
16
  You will receive an image of a text block and the text that can be extracted from the image.
17
+ Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
18
 
19
  Formatting should be in markdown, with the following rules:
20
  - * for italics, ** for bold, and ` for inline code.
 
29
 
30
  **Instructions:**
31
  1. Carefully examine the provided block image.
32
+ 2. Analyze the existing text representation.
33
+ 3. Generate the markdown representation of the content in the image.
 
 
34
  **Example:**
35
  Input:
36
  ```text
37
+ Table 1: Car Sales
38
  ```
39
  Output:
40
  ```markdown
41
+ ## Table 1: Car Sales
42
+
43
+ | Car | Sales |
44
+ | --- | --- |
45
+ | Honda | 100 |
46
+ | Toyota | 200 |
47
  ```
48
  **Input:**
49
+ ```text
50
+ {extracted_text}
51
+ ```
52
  """
53
 
54
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
55
  text = block.raw_text(document)
56
+ prompt = self.complex_region_prompt.replace("{extracted_text}", text)
57
+ image = self.extract_image(document, block)
 
58
  response_schema = content.Schema(
59
  type=content.Type.OBJECT,
60
  enum=[],
 
84
  return
85
 
86
  # Convert LLM markdown to html
87
+ corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip()
88
  block.html = markdown2.markdown(corrected_markdown)
marker/processors/llm/llm_equation.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from marker.processors.llm import BaseLLMProcessor
2
+
3
+ from google.ai.generativelanguage_v1beta.types import content
4
+
5
+ from marker.schema import BlockTypes
6
+ from marker.schema.blocks import Equation
7
+ from marker.schema.document import Document
8
+ from marker.schema.groups.page import PageGroup
9
+
10
+ from typing import Annotated
11
+
12
+
13
+ class LLMEquationProcessor(BaseLLMProcessor):
14
+ block_types = (BlockTypes.Equation,)
15
+ min_equation_height: Annotated[
16
+ float,
17
+ "The minimum ratio between equation height and page height to consider for processing.",
18
+ ] = 0.1
19
+ equation_latex_prompt: Annotated[
20
+ str,
21
+ "The prompt to use for generating LaTeX from equations.",
22
+ "Default is a string containing the Gemini prompt."
23
+ ] = """You're an expert mathematician who is good at writing LaTeX code for equations'.
24
+ You will receive an image of a math block that may contain one or more equations. Your job is to write the LaTeX code for the equation, along with markdown for any other text.
25
+
26
+ Some guidelines:
27
+ - Keep the LaTeX code simple and concise.
28
+ - Make it KaTeX compatible.
29
+ - Use $$ as a block equation delimiter and $ for inline equations. Block equations should also be on their own line. Do not use any other delimiters.
30
+ - You can include text in between equation blocks as needed. Try to put long text segments into plain text and not inside the equations.
31
+
32
+ **Instructions:**
33
+ 1. Carefully examine the provided image.
34
+ 2. Analyze the existing markdown, which may include LaTeX code.
35
+ 3. If the markdown and LaTeX are correct, write "No corrections needed."
36
+ 4. If the markdown and LaTeX are incorrect, generate the corrected markdown and LaTeX.
37
+ 5. Output only the corrected text or "No corrections needed."
38
+ **Example:**
39
+ Input:
40
+ ```markdown
41
+ Equation 1:
42
+ $$x^2 + y^2 = z2$$
43
+ ```
44
+ Output:
45
+ ```markdown
46
+ Equation 1:
47
+ $$x^2 + y^2 = z^2$$
48
+ ```
49
+ **Input:**
50
+ ```markdown
51
+ {equation}
52
+ ```
53
+ """
54
+
55
+ def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
56
+ text = block.latex if block.latex else block.raw_text(document)
57
+ prompt = self.equation_latex_prompt.replace("{equation}", text)
58
+
59
+ image = self.extract_image(document, block)
60
+ response_schema = content.Schema(
61
+ type=content.Type.OBJECT,
62
+ enum=[],
63
+ required=["markdown_equation"],
64
+ properties={
65
+ "markdown_equation": content.Schema(
66
+ type=content.Type.STRING
67
+ )
68
+ },
69
+ )
70
+
71
+ response = self.model.generate_response(prompt, image, block, response_schema)
72
+
73
+ if not response or "markdown_equation" not in response:
74
+ block.update_metadata(llm_error_count=1)
75
+ return
76
+
77
+ markdown_equation = response["markdown_equation"]
78
+ if len(markdown_equation) < len(text) * .5:
79
+ block.update_metadata(llm_error_count=1)
80
+ return
81
+
82
+ block.latex = markdown_equation
marker/processors/llm/llm_form.py CHANGED
@@ -1,9 +1,6 @@
1
- import markdown2
2
-
3
  from marker.processors.llm import BaseLLMProcessor
4
 
5
  from google.ai.generativelanguage_v1beta.types import content
6
- from tabled.formats import markdown_format
7
 
8
  from marker.schema import BlockTypes
9
  from marker.schema.blocks import Block
@@ -13,48 +10,75 @@ from marker.schema.groups.page import PageGroup
13
 
14
  class LLMFormProcessor(BaseLLMProcessor):
15
  block_types = (BlockTypes.Form,)
16
- gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
17
- You will receive an image of a text block and a markdown representation of the form in the image.
18
- Your task is to correct any errors in the markdown representation, and format it properly.
19
- Values and labels should appear in markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
20
  **Instructions:**
21
  1. Carefully examine the provided form block image.
22
- 2. Analyze the markdown representation of the form.
23
- 3. If the markdown representation is largely correct, then write "No corrections needed."
24
- 4. If the markdown representation contains errors, generate the corrected markdown representation.
25
- 5. Output only either the corrected markdown representation or "No corrections needed."
26
  **Example:**
27
  Input:
28
- ```markdown
29
- | Label 1 | Label 2 | Label 3 |
30
- |----------|----------|----------|
31
- | Value 1 | Value 2 | Value 3 |
 
 
 
 
 
 
 
 
 
32
  ```
33
  Output:
34
- ```markdown
35
- | Labels | Values |
36
- |--------|--------|
37
- | Label 1 | Value 1 |
38
- | Label 2 | Value 2 |
39
- | Label 3 | Value 3 |
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ```
41
  **Input:**
 
 
 
42
  """
43
 
44
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
45
- cells = block.cells
46
- if cells is None:
47
  # Happens if table/form processors didn't run
48
  return
49
 
50
- prompt = self.gemini_rewriting_prompt + '```markdown\n`' + markdown_format(cells) + '`\n```\n'
51
- image = self.extract_image(page, block)
 
 
52
  response_schema = content.Schema(
53
  type=content.Type.OBJECT,
54
  enum=[],
55
- required=["corrected_markdown"],
56
  properties={
57
- "corrected_markdown": content.Schema(
58
  type=content.Type.STRING
59
  )
60
  },
@@ -62,22 +86,20 @@ Output:
62
 
63
  response = self.model.generate_response(prompt, image, block, response_schema)
64
 
65
- if not response or "corrected_markdown" not in response:
66
  block.update_metadata(llm_error_count=1)
67
  return
68
 
69
- corrected_markdown = response["corrected_markdown"]
70
 
71
  # The original table is okay
72
- if "no corrections" in corrected_markdown.lower():
73
  return
74
 
75
- orig_cell_text = "".join([cell.text for cell in cells])
76
-
77
  # Potentially a partial response
78
- if len(corrected_markdown) < len(orig_cell_text) * .5:
79
  block.update_metadata(llm_error_count=1)
80
  return
81
 
82
- # Convert LLM markdown to html
83
- block.html = markdown2.markdown(corrected_markdown)
 
 
 
1
  from marker.processors.llm import BaseLLMProcessor
2
 
3
  from google.ai.generativelanguage_v1beta.types import content
 
4
 
5
  from marker.schema import BlockTypes
6
  from marker.schema.blocks import Block
 
10
 
11
  class LLMFormProcessor(BaseLLMProcessor):
12
  block_types = (BlockTypes.Form,)
13
+ form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
14
+ You will receive an image of a text block and an html representation of the form in the image.
15
+ Your task is to correct any errors in the html representation, and format it properly.
16
+ Values and labels should appear in html tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
17
  **Instructions:**
18
  1. Carefully examine the provided form block image.
19
+ 2. Analyze the html representation of the form.
20
+ 3. If the html representation is largely correct, then write "No corrections needed."
21
+ 4. If the html representation contains errors, generate the corrected html representation.
22
+ 5. Output only either the corrected html representation or "No corrections needed."
23
  **Example:**
24
  Input:
25
+ ```html
26
+ <table>
27
+ <tr>
28
+ <td>Label 1</td>
29
+ <td>Label 2</td>
30
+ <td>Label 3</td>
31
+ </tr>
32
+ <tr>
33
+ <td>Value 1</td>
34
+ <td>Value 2</td>
35
+ <td>Value 3</td>
36
+ </tr>
37
+ </table>
38
  ```
39
  Output:
40
+ ```html
41
+ <table>
42
+ <tr>
43
+ <th>Labels</th>
44
+ <th>Values</th>
45
+ </tr>
46
+ <tr>
47
+ <td>Label 1</td>
48
+ <td>Value 1</td>
49
+ </tr>
50
+ <tr>
51
+ <td>Label 2</td>
52
+ <td>Value 2</td>
53
+ </tr>
54
+ <tr>
55
+ <td>Label 3</td>
56
+ <td>Value 3</td>
57
+ </tr>
58
+ </table>
59
  ```
60
  **Input:**
61
+ ```html
62
+ {block_html}
63
+ ```
64
  """
65
 
66
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
67
+ children = block.contained_blocks(document, (BlockTypes.TableCell,))
68
+ if not children:
69
  # Happens if table/form processors didn't run
70
  return
71
 
72
+ block_html = block.render(document).html
73
+ prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
74
+
75
+ image = self.extract_image(document, block)
76
  response_schema = content.Schema(
77
  type=content.Type.OBJECT,
78
  enum=[],
79
+ required=["corrected_html"],
80
  properties={
81
+ "corrected_html": content.Schema(
82
  type=content.Type.STRING
83
  )
84
  },
 
86
 
87
  response = self.model.generate_response(prompt, image, block, response_schema)
88
 
89
+ if not response or "corrected_html" not in response:
90
  block.update_metadata(llm_error_count=1)
91
  return
92
 
93
+ corrected_html = response["corrected_html"]
94
 
95
  # The original table is okay
96
+ if "no corrections" in corrected_html.lower():
97
  return
98
 
 
 
99
  # Potentially a partial response
100
+ if len(corrected_html) < len(block_html) * .33:
101
  block.update_metadata(llm_error_count=1)
102
  return
103
 
104
+ corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()
105
+ block.html = corrected_html
marker/processors/llm/llm_handwriting.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import markdown2

from marker.processors.llm import BaseLLMProcessor

from google.ai.generativelanguage_v1beta.types import content

from marker.schema import BlockTypes
from marker.schema.blocks import Equation
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup

from typing import Annotated


class LLMHandwritingProcessor(BaseLLMProcessor):
    """OCRs a block's handwriting with the LLM and stores the result as HTML.

    The block image (plus whatever text was already extracted) is sent to the
    model, which returns markdown; the markdown is rendered with markdown2 and
    written back to ``block.html``.
    """

    # NOTE(review): this targets Equation blocks even though the class handles
    # handwriting -- confirm this is intentional and not a copy-paste from the
    # equation processor.
    block_types = (BlockTypes.Equation,)
    # NOTE(review): min_handwriting_height is not referenced in this file;
    # presumably consumed by a caller or filter -- verify before removing.
    min_handwriting_height: Annotated[
        float,
        "The minimum ratio between handwriting height and page height to consider for processing.",
    ] = 0.1
    handwriting_generation_prompt: Annotated[
        str,
        "The prompt to use for OCRing handwriting.",
        "Default is a string containing the Gemini prompt."
    ] = """You are an expert editor specializing in accurately reproducing text from images.
You will receive an image of a text block, along with the text that can be extracted. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.

Formatting should be in markdown, with the following rules:
- * for italics, ** for bold, and ` for inline code.
- Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
- Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
- Links should be formatted with [text](url).
- Use ``` for code blocks.
- Inline math should be formatted with <math>math expression</math>.
- Display math should be formatted with <math display="block">math expression</math>.
- Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right.  The headers should be "Labels" and "Values".  Other text in the form can appear between the tables.
- Tables should be formatted with markdown tables, with the headers bolded.

**Instructions:**
1. Carefully examine the provided block image.
2. Analyze the existing text representation.
3. Output the markdown representing the content of the image.
**Example:**
Input:
```text
This i sm handwritting.
```
Output:
```markdown
This is some *handwriting*.
```
**Input:**
```text
{extracted_text}
```
"""

    def process_rewriting(self, document: Document, page: PageGroup, block: Equation):
        """Replace ``block.html`` with LLM-generated markdown rendered to HTML.

        Increments the block's ``llm_error_count`` metadata and leaves the
        block unchanged when the model response is missing or suspiciously
        short relative to the extracted text.
        """
        text = block.raw_text(document)
        # Bug fix: the template's placeholder is {extracted_text}; the old code
        # replaced "{handwriting_text}", which never occurs in the prompt, so
        # the placeholder was sent to the model unsubstituted.
        prompt = self.handwriting_generation_prompt.replace("{extracted_text}", text)

        image = self.extract_image(document, block)
        response_schema = content.Schema(
            type=content.Type.OBJECT,
            enum=[],
            required=["markdown"],
            properties={
                "markdown": content.Schema(
                    type=content.Type.STRING
                )
            },
        )

        response = self.model.generate_response(prompt, image, block, response_schema)

        if not response or "markdown" not in response:
            block.update_metadata(llm_error_count=1)
            return

        markdown = response["markdown"]
        # Guard against a truncated/partial model response.
        if len(markdown) < len(text) * .5:
            block.update_metadata(llm_error_count=1)
            return

        # Bug fix: str.lstrip/rstrip treat their argument as a character SET,
        # so lstrip("```markdown") would also eat leading letters such as "m"
        # or "a" from real content. removeprefix/removesuffix strip only a
        # literal code fence.
        markdown = markdown.strip()
        markdown = markdown.removeprefix("```markdown").removesuffix("```").strip()
        block.html = markdown2.markdown(markdown)
marker/processors/llm/llm_image_description.py CHANGED
@@ -36,6 +36,9 @@ Apples, Bananas, Oranges
36
  Output:
37
  In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
38
  **Input:**
 
 
 
39
  """
40
 
41
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
@@ -44,8 +47,8 @@ In this figure, a bar chart titled "Fruit Preference Survey" is showing the numb
44
  # Since this processor replaces images with descriptions
45
  return
46
 
47
- prompt = self.image_description_prompt + '```text\n`' + block.raw_text(document) + '`\n```\n'
48
- image = self.extract_image(page, block)
49
  response_schema = content.Schema(
50
  type=content.Type.OBJECT,
51
  enum=[],
 
36
  Output:
37
  In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
38
  **Input:**
39
+ ```text
40
+ {raw_text}
41
+ ```
42
  """
43
 
44
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
 
47
  # Since this processor replaces images with descriptions
48
  return
49
 
50
+ prompt = self.image_description_prompt.replace("{raw_text}", block.raw_text(document))
51
+ image = self.extract_image(document, block)
52
  response_schema = content.Schema(
53
  type=content.Type.OBJECT,
54
  enum=[],
marker/processors/llm/llm_table.py CHANGED
@@ -2,12 +2,10 @@ from typing import Annotated, List, Tuple
2
 
3
  from bs4 import BeautifulSoup
4
  from google.ai.generativelanguage_v1beta.types import content
5
- from tabled.formats import html_format
6
- from tabled.schema import SpanTableCell
7
 
8
  from marker.processors.llm import BaseLLMProcessor
9
  from marker.schema import BlockTypes
10
- from marker.schema.blocks import Block
11
  from marker.schema.document import Document
12
  from marker.schema.groups.page import PageGroup
13
  from marker.schema.polygon import PolygonBox
@@ -17,19 +15,26 @@ class LLMTableProcessor(BaseLLMProcessor):
17
  block_types: Annotated[
18
  Tuple[BlockTypes],
19
  "The block types to process.",
20
- ] = (BlockTypes.Table,)
21
- gemini_rewriting_prompt: Annotated[
22
  str,
23
  "The prompt to use for rewriting text.",
24
  "Default is a string containing the Gemini rewriting prompt."
25
  ] = """You are a text correction expert specializing in accurately reproducing text from images.
26
  You will receive an image of a text block and an html representation of the table in the image.
27
  Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
 
 
 
 
 
 
 
28
  **Instructions:**
29
  1. Carefully examine the provided text block image.
30
  2. Analyze the html representation of the table.
31
  3. If the html representation is largely correct, then write "No corrections needed."
32
- 4. If the html representation contains errors, generate the corrected html representation. Only use the tags th, td, tr, and table. Only use the attributes colspan and rowspan if necessary.
33
  5. Output only either the corrected html representation or "No corrections needed."
34
  **Example:**
35
  Input:
@@ -52,16 +57,21 @@ Output:
52
  No corrections needed.
53
  ```
54
  **Input:**
 
 
 
55
  """
56
 
57
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
58
- cells = block.cells
59
- if cells is None:
60
  # Happens if table/form processors didn't run
61
  return
62
 
63
- prompt = self.gemini_rewriting_prompt + '```html\n`' + html_format(cells) + '`\n```\n'
64
- image = self.extract_image(page, block)
 
 
65
  response_schema = content.Schema(
66
  type=content.Type.OBJECT,
67
  enum=[],
@@ -85,31 +95,49 @@ No corrections needed.
85
  if "no corrections" in corrected_html.lower():
86
  return
87
 
88
- parsed_cells = self.parse_html_table(corrected_html, block)
 
89
  if len(parsed_cells) <= 1:
90
  block.update_metadata(llm_error_count=1)
91
  return
92
 
93
  parsed_cell_text = "".join([cell.text for cell in parsed_cells])
94
- orig_cell_text = "".join([cell.text for cell in cells])
95
-
96
  # Potentially a partial response
97
  if len(parsed_cell_text) < len(orig_cell_text) * .5:
98
  block.update_metadata(llm_error_count=1)
99
  return
100
 
101
- block.cells = parsed_cells
 
 
 
102
 
103
- def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
 
 
 
 
 
 
 
104
  soup = BeautifulSoup(html_text, 'html.parser')
105
  table = soup.find('table')
106
 
107
  # Initialize grid
108
  rows = table.find_all('tr')
109
  cells = []
110
- max_cols = max(len(row.find_all(['td', 'th'])) for row in rows)
111
- if max_cols == 0:
112
- return []
 
 
 
 
 
 
 
 
113
 
114
  grid = [[True] * max_cols for _ in range(len(rows))]
115
 
@@ -124,7 +152,7 @@ No corrections needed.
124
  print("Table parsing warning: too many columns found")
125
  break
126
 
127
- cell_text = cell.text.strip()
128
  rowspan = min(int(cell.get('rowspan', 1)), len(rows) - i)
129
  colspan = min(int(cell.get('colspan', 1)), max_cols - cur_col)
130
  cell_rows = list(range(i, i + rowspan))
@@ -146,11 +174,15 @@ No corrections needed.
146
  ]
147
  cell_polygon = PolygonBox.from_bbox(cell_bbox)
148
 
149
- cell_obj = SpanTableCell(
150
  text=cell_text,
151
- row_ids=cell_rows,
152
- col_ids=cell_cols,
153
- bbox=cell_polygon.bbox
 
 
 
 
154
  )
155
  cells.append(cell_obj)
156
  cur_col += colspan
 
2
 
3
  from bs4 import BeautifulSoup
4
  from google.ai.generativelanguage_v1beta.types import content
 
 
5
 
6
  from marker.processors.llm import BaseLLMProcessor
7
  from marker.schema import BlockTypes
8
+ from marker.schema.blocks import Block, TableCell
9
  from marker.schema.document import Document
10
  from marker.schema.groups.page import PageGroup
11
  from marker.schema.polygon import PolygonBox
 
15
  block_types: Annotated[
16
  Tuple[BlockTypes],
17
  "The block types to process.",
18
+ ] = (BlockTypes.Table, BlockTypes.TableOfContents)
19
+ table_rewriting_prompt: Annotated[
20
  str,
21
  "The prompt to use for rewriting text.",
22
  "Default is a string containing the Gemini rewriting prompt."
23
  ] = """You are a text correction expert specializing in accurately reproducing text from images.
24
  You will receive an image of a text block and an html representation of the table in the image.
25
  Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
26
+
27
+ Some guidelines:
28
+ - Make sure to reproduce the original values as faithfully as possible.
29
+ - If you see any math in a table cell, fence it with the <math display="inline"> tag. Block math should be fenced with <math display="block">.
30
+ - Replace any images with a description, like "Image: [description]".
31
+ - Only use the tags th, td, tr, span, i, b, math, and table. Only use the attributes display, style, colspan, and rowspan if necessary.
32
+
33
  **Instructions:**
34
  1. Carefully examine the provided text block image.
35
  2. Analyze the html representation of the table.
36
  3. If the html representation is largely correct, then write "No corrections needed."
37
+ 4. If the html representation contains errors, generate the corrected html representation.
38
  5. Output only either the corrected html representation or "No corrections needed."
39
  **Example:**
40
  Input:
 
57
  No corrections needed.
58
  ```
59
  **Input:**
60
+ ```html
61
+ {block_html}
62
+ ```
63
  """
64
 
65
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
66
+ children = block.contained_blocks(document, (BlockTypes.TableCell,))
67
+ if not children:
68
  # Happens if table/form processors didn't run
69
  return
70
 
71
+ block_html = block.render(document).html
72
+ prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)
73
+
74
+ image = self.extract_image(document, block)
75
  response_schema = content.Schema(
76
  type=content.Type.OBJECT,
77
  enum=[],
 
95
  if "no corrections" in corrected_html.lower():
96
  return
97
 
98
+ corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()
99
+ parsed_cells = self.parse_html_table(corrected_html, block, page)
100
  if len(parsed_cells) <= 1:
101
  block.update_metadata(llm_error_count=1)
102
  return
103
 
104
  parsed_cell_text = "".join([cell.text for cell in parsed_cells])
105
+ orig_cell_text = "".join([cell.text for cell in children])
 
106
  # Potentially a partial response
107
  if len(parsed_cell_text) < len(orig_cell_text) * .5:
108
  block.update_metadata(llm_error_count=1)
109
  return
110
 
111
+ block.structure = []
112
+ for cell in parsed_cells:
113
+ page.add_full_block(cell)
114
+ block.add_structure(cell)
115
 
116
+ @staticmethod
117
+ def get_cell_text(element, keep_tags=('br',)):
118
+ for tag in element.find_all(True):
119
+ if tag.name not in keep_tags:
120
+ tag.unwrap()
121
+ return element.decode_contents().replace("<br>", "\n")
122
+
123
+ def parse_html_table(self, html_text: str, block: Block, page: PageGroup) -> List[TableCell]:
124
  soup = BeautifulSoup(html_text, 'html.parser')
125
  table = soup.find('table')
126
 
127
  # Initialize grid
128
  rows = table.find_all('tr')
129
  cells = []
130
+
131
+ # Find maximum number of columns in colspan-aware way
132
+ max_cols = 0
133
+ for row in rows:
134
+ row_tds = row.find_all(['td', 'th'])
135
+ curr_cols = 0
136
+ for cell in row_tds:
137
+ colspan = int(cell.get('colspan', 1))
138
+ curr_cols += colspan
139
+ if curr_cols > max_cols:
140
+ max_cols = curr_cols
141
 
142
  grid = [[True] * max_cols for _ in range(len(rows))]
143
 
 
152
  print("Table parsing warning: too many columns found")
153
  break
154
 
155
+ cell_text = self.get_cell_text(cell).strip()
156
  rowspan = min(int(cell.get('rowspan', 1)), len(rows) - i)
157
  colspan = min(int(cell.get('colspan', 1)), max_cols - cur_col)
158
  cell_rows = list(range(i, i + rowspan))
 
174
  ]
175
  cell_polygon = PolygonBox.from_bbox(cell_bbox)
176
 
177
+ cell_obj = TableCell(
178
  text=cell_text,
179
+ row_id=i,
180
+ col_id=cur_col,
181
+ rowspan=rowspan,
182
+ colspan=colspan,
183
+ is_header=cell.name == 'th',
184
+ polygon=cell_polygon,
185
+ page_id=page.page_id,
186
  )
187
  cells.append(cell_obj)
188
  cur_col += colspan
marker/processors/llm/llm_table_merge.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Annotated, List, Tuple, Literal

from google.ai.generativelanguage_v1beta.types import content
from tqdm import tqdm
from PIL import Image

from marker.processors.llm import BaseLLMProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block, TableCell
from marker.schema.document import Document


class LLMTableMergeProcessor(BaseLLMProcessor):
    """Finds runs of tables that look like fragments of one larger table
    (continued on the next page, or split side-by-side on one page) and asks
    the LLM whether each pair should be merged, stitching the cells and the
    lowres images together when it says yes.
    """

    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Table, BlockTypes.TableOfContents)
    table_height_threshold: Annotated[
        float,
        "The minimum height ratio relative to the page for the first table in a pair to be considered for merging.",
    ] = 0.6
    # NOTE(review): table_start_threshold is not referenced in this file --
    # confirm it is still needed before removing.
    table_start_threshold: Annotated[
        float,
        "The maximum percentage down the page the second table can start to be considered for merging."
    ] = 0.2
    vertical_table_height_threshold: Annotated[
        float,
        "The height tolerance for 2 adjacent tables to be merged into one."
    ] = 0.25
    vertical_table_distance_threshold: Annotated[
        int,
        "The maximum distance between table edges for adjacency."
    ] = 20
    column_gap_threshold: Annotated[
        int,
        "The maximum gap between columns to merge tables"
    ] = 50
    # Bug fix: the template below was missing the closing ``` fence after the
    # {{table1}} placeholder, which made the example and real input fences
    # unbalanced in the prompt sent to the model.
    table_merge_prompt: Annotated[
        str,
        "The prompt to use for rewriting text.",
        "Default is a string containing the Gemini rewriting prompt."
    ] = """You're a text correction expert specializing in accurately reproducing tables from PDFs.
You'll receive two images of tables from successive pages of a PDF.  Table 1 is from the first page, and Table 2 is from the second page.  Both tables may actually be part of the same larger table.  Your job is to decide if Table 2 should be merged with Table 1, and how they should be joined.  The should only be merged if they're part of the same larger table, and Table 2 cannot be interpreted without merging.

You'll specify your judgement in json format - first whether Table 2 should be merged with Table 1, then the direction of the merge, either `bottom` or `right`.  A bottom merge means that the rows of Table 2 are joined to the rows of Table 1.  A right merge means that the columns of Table 2 are joined to the columns of Table 1.  (bottom merge is equal to np.vstack, right merge is equal to np.hstack)

Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1.  Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.

Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging.

**Instructions:**
1. Carefully examine the provided table images.  Table 1 is the first image, and Table 2 is the second image.
2. Examine the provided html representations of Table 1 and Table 2.
3. Write a description of Table 1.
4. Write a description of Table 2.
5. Analyze whether Table 2 should be merged into Table 1, and write an explanation.
6. Output your decision on whether they should be merged, and merge direction.
**Example:**
Input:
Table 1
```html
<table>
<tr>
<th>Name</th>
<th>Age</th>
<th>City</th>
<th>State</th>
</tr>
<tr>
<td>John</td>
<td>25</td>
<td>Chicago</td>
<td>IL</td>
</tr>
```
Table 2
```html
<table>
<tr>
<td>Jane</td>
<td>30</td>
<td>Los Angeles</td>
<td>CA</td>
</tr>
```
Output:
```json
{
    "table1_description": "Table 1 has 4 headers, and 1 row.  The headers are Name, Age, City, and State.",
    "table2_description": "Table 2 has no headers, but the values appear to represent a person's name, age, city, and state.",
    "explanation": "The values in Table 2 match the headers in Table 1, and Table 2 has no headers.  Table 2 should be merged to the bottom of Table 1.",
    "merge": "true",
    "direction": "bottom"
}
```
**Input:**
Table 1
```html
{{table1}}
```
Table 2
```html
{{table2}}
```
"""

    @staticmethod
    def get_row_count(cells: List[TableCell]):
        """Return the effective row count: the maximum total rowspan found in
        any single column.  Returns None for an empty cell list."""
        max_rows = None
        for col_id in set([cell.col_id for cell in cells]):
            col_cells = [cell for cell in cells if cell.col_id == col_id]
            rows = 0
            for cell in col_cells:
                rows += cell.rowspan
            if max_rows is None or rows > max_rows:
                max_rows = rows
        return max_rows

    @staticmethod
    def get_column_count(cells: List[TableCell]):
        """Return the effective column count: the maximum total colspan found
        in any single row.  Returns None for an empty cell list."""
        max_cols = None
        for row_id in set([cell.row_id for cell in cells]):
            row_cells = [cell for cell in cells if cell.row_id == row_id]
            cols = 0
            for cell in row_cells:
                cols += cell.colspan
            if max_cols is None or cols > max_cols:
                max_cols = cols
        return max_cols

    def rewrite_blocks(self, document: Document):
        """Group merge-candidate tables into runs, then process each run
        concurrently via process_rewriting."""
        pbar = tqdm(desc=f"{self.__class__.__name__} running")
        table_runs = []
        table_run = []
        prev_block = None
        prev_page_block_count = None
        for page in document.pages:
            page_blocks = page.contained_blocks(document, self.block_types)
            for block in page_blocks:
                merge_condition = False
                if prev_block is not None:
                    prev_cells = prev_block.contained_blocks(document, (BlockTypes.TableCell,))
                    curr_cells = block.contained_blocks(document, (BlockTypes.TableCell,))
                    # Guard: get_row_count/get_column_count return None for
                    # empty cell lists (e.g. when the table processor didn't
                    # run), which would make the abs() arithmetic crash.
                    if prev_cells and curr_cells:
                        # Bug fix: the original assignment ended with a
                        # trailing comma, making row_match a 1-tuple that was
                        # always truthy regardless of the comparison result.
                        row_match = abs(self.get_row_count(prev_cells) - self.get_row_count(curr_cells)) < 5  # Similar number of rows
                        col_match = abs(self.get_column_count(prev_cells) - self.get_column_count(curr_cells)) < 2

                        subsequent_page_table = all([
                            prev_block.page_id == block.page_id - 1,  # Subsequent pages
                            max(prev_block.polygon.height / page.polygon.height,
                                block.polygon.height / page.polygon.height) > self.table_height_threshold,  # Take up most of the page height
                            (len(page_blocks) == 1 or prev_page_block_count == 1),  # Only table on the page
                            (row_match or col_match)
                        ])

                        same_page_vertical_table = all([
                            prev_block.page_id == block.page_id,  # On the same page
                            (1 - self.vertical_table_height_threshold) < prev_block.polygon.height / block.polygon.height < (1 + self.vertical_table_height_threshold),  # Similar height
                            abs(block.polygon.x_start - prev_block.polygon.x_end) < self.vertical_table_distance_threshold,  # Close together in x
                            abs(block.polygon.y_start - prev_block.polygon.y_start) < self.vertical_table_distance_threshold,  # Close together in y
                            row_match
                        ])

                        same_page_new_column = all([
                            prev_block.page_id == block.page_id,  # On the same page
                            abs(block.polygon.x_start - prev_block.polygon.x_end) < self.column_gap_threshold,
                            block.polygon.y_start < prev_block.polygon.y_end,
                            block.polygon.width * (1 - self.vertical_table_height_threshold) < prev_block.polygon.width < block.polygon.width * (1 + self.vertical_table_height_threshold),  # Similar width
                            col_match
                        ])
                        merge_condition = any([subsequent_page_table, same_page_vertical_table, same_page_new_column])

                if prev_block is not None and merge_condition:
                    if prev_block not in table_run:
                        table_run.append(prev_block)
                    table_run.append(block)
                else:
                    if table_run:
                        table_runs.append(table_run)
                    table_run = []
                prev_block = block
                prev_page_block_count = len(page_blocks)

        if table_run:
            table_runs.append(table_run)

        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in as_completed([
                executor.submit(self.process_rewriting, document, blocks)
                for blocks in table_runs
            ]):
                future.result()  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()

    def process_rewriting(self, document: Document, blocks: List[Block]):
        """Walk a run of candidate tables, asking the LLM pairwise whether the
        current table should be merged into the accumulated start table.

        On a confirmed merge, the current block's cells are re-parented into
        the start block and the images are stitched; otherwise the current
        block becomes the new start of comparison.
        """
        if len(blocks) < 2:
            # Can't merge single tables
            return

        start_block = blocks[0]
        for i in range(1, len(blocks)):
            curr_block = blocks[i]
            children = start_block.contained_blocks(document, (BlockTypes.TableCell,))
            children_curr = curr_block.contained_blocks(document, (BlockTypes.TableCell,))
            if not children or not children_curr:
                # Happens if table/form processors didn't run
                break

            start_image = start_block.get_image(document, highres=False)
            curr_image = curr_block.get_image(document, highres=False)
            start_html = start_block.render(document).html
            curr_html = curr_block.render(document).html

            prompt = self.table_merge_prompt.replace("{{table1}}", start_html).replace("{{table2}}", curr_html)

            response_schema = content.Schema(
                type=content.Type.OBJECT,
                enum=[],
                required=["table1_description", "table2_description", "explanation", "merge", "direction"],
                properties={
                    "table1_description": content.Schema(
                        type=content.Type.STRING
                    ),
                    "table2_description": content.Schema(
                        type=content.Type.STRING
                    ),
                    "explanation": content.Schema(
                        type=content.Type.STRING
                    ),
                    "merge": content.Schema(
                        type=content.Type.STRING,
                        enum=["true", "false"]
                    ),
                    "direction": content.Schema(
                        type=content.Type.STRING,
                        enum=["bottom", "right"]
                    ),
                },
            )

            response = self.model.generate_response(
                prompt,
                [start_image, curr_image],  # Per Gemini docs, images go first
                curr_block,
                response_schema
            )

            if not response or ("direction" not in response or "merge" not in response):
                curr_block.update_metadata(llm_error_count=1)
                break

            merge = response["merge"]

            # The original table is okay
            if "true" not in merge:
                start_block = curr_block
                continue

            # Merge the cells and images of the tables
            direction = response["direction"]
            if not self.validate_merge(children, children_curr, direction):
                start_block = curr_block
                continue

            merged_image = self.join_images(start_image, curr_image, direction)
            merged_cells = self.join_cells(children, children_curr, direction)
            curr_block.structure = []
            start_block.structure = [b.id for b in merged_cells]
            start_block.lowres_image = merged_image

    def validate_merge(self, cells1: List[TableCell], cells2: List[TableCell], direction: Literal['right', 'bottom'] = 'right'):
        """Sanity-check a proposed merge: right merges need matching row
        counts; bottom merges need matching column counts."""
        if direction == "right":
            # Check if the number of rows is the same
            cells1_row_count = self.get_row_count(cells1)
            cells2_row_count = self.get_row_count(cells2)
            return abs(cells1_row_count - cells2_row_count) < 5
        elif direction == "bottom":
            # Check if the number of columns is the same
            cells1_col_count = self.get_column_count(cells1)
            cells2_col_count = self.get_column_count(cells2)
            return abs(cells1_col_count - cells2_col_count) < 2
        # Bug fix: previously fell through and returned None for an unexpected
        # direction; make the rejection explicit.
        return False

    def join_cells(self, cells1: List[TableCell], cells2: List[TableCell], direction: Literal['right', 'bottom'] = 'right') -> List[TableCell]:
        """Concatenate two cell lists, offsetting the second table's col_ids
        (right merge) or row_ids (bottom merge) past the first table's extent.

        Note: mutates the cells of ``cells2`` in place.
        """
        if direction == 'right':
            # Shift columns right
            col_count = self.get_column_count(cells1)
            for cell in cells2:
                cell.col_id += col_count
            new_cells = cells1 + cells2
        else:
            # Shift rows down, below table 1
            row_count = self.get_row_count(cells1)
            for cell in cells2:
                cell.row_id += row_count
            new_cells = cells1 + cells2
        return new_cells

    @staticmethod
    def join_images(image1: Image.Image, image2: Image.Image, direction: Literal['right', 'bottom'] = 'right') -> Image.Image:
        """Paste the two table images side by side (right) or stacked
        (bottom) onto a white canvas sized to fit both."""
        # Get dimensions
        w1, h1 = image1.size
        w2, h2 = image2.size

        if direction == 'right':
            new_height = max(h1, h2)
            new_width = w1 + w2
            new_img = Image.new('RGB', (new_width, new_height), 'white')
            new_img.paste(image1, (0, 0))
            new_img.paste(image2, (w1, 0))
        else:
            new_width = max(w1, w2)
            new_height = h1 + h2
            new_img = Image.new('RGB', (new_width, new_height), 'white')
            new_img.paste(image1, (0, 0))
            new_img.paste(image2, (0, h1))
        return new_img
marker/processors/llm/llm_text.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
 
3
  from marker.processors.llm import BaseLLMProcessor
4
  from bs4 import BeautifulSoup
@@ -13,10 +14,10 @@ from marker.schema.text.span import Span
13
 
14
  class LLMTextProcessor(BaseLLMProcessor):
15
  block_types = (BlockTypes.TextInlineMath, BlockTypes.Handwriting)
16
- gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
17
  You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
18
  Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
19
- The number of output lines MUST match the number of input lines.
20
 
21
  **Instructions:**
22
 
@@ -64,7 +65,9 @@ Output:
64
  ```
65
 
66
  **Input:**
67
-
 
 
68
  """
69
 
70
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
@@ -73,8 +76,8 @@ Output:
73
  text_lines = block.contained_blocks(document, (BlockTypes.Line,))
74
  extracted_lines = [line.formatted_text(document) for line in text_lines]
75
 
76
- prompt = self.gemini_rewriting_prompt + '```json\n`' + json.dumps({"extracted_lines": extracted_lines}, indent=2) + '`\n```\n'
77
- image = self.extract_image(page, block)
78
  response_schema = content.Schema(
79
  type=content.Type.OBJECT,
80
  enum=[],
 
1
  import json
2
+ import textwrap
3
 
4
  from marker.processors.llm import BaseLLMProcessor
5
  from bs4 import BeautifulSoup
 
14
 
15
  class LLMTextProcessor(BaseLLMProcessor):
16
  block_types = (BlockTypes.TextInlineMath, BlockTypes.Handwriting)
17
+ text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
18
  You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
19
  Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
20
+ The number of output lines MUST match the number of input lines. Stay as faithful to the original text as possible.
21
 
22
  **Instructions:**
23
 
 
65
  ```
66
 
67
  **Input:**
68
+ ```json
69
+ {extracted_lines}
70
+ ```
71
  """
72
 
73
  def process_rewriting(self, document: Document, page: PageGroup, block: Block):
 
76
  text_lines = block.contained_blocks(document, (BlockTypes.Line,))
77
  extracted_lines = [line.formatted_text(document) for line in text_lines]
78
 
79
+ prompt = self.text_math_rewriting_prompt.replace("{extracted_lines}", json.dumps({"extracted_lines": extracted_lines}, indent=2))
80
+ image = self.extract_image(document, block)
81
  response_schema = content.Schema(
82
  type=content.Type.OBJECT,
83
  enum=[],
marker/processors/llm/utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import time
 
3
 
4
  import PIL
5
  import google.generativeai as genai
@@ -25,17 +26,19 @@ class GoogleModel:
25
  def generate_response(
26
  self,
27
  prompt: str,
28
- image: PIL.Image.Image,
29
  block: Block,
30
  response_schema: content.Schema,
31
  max_retries: int = 3,
32
  timeout: int = 60
33
  ):
 
 
34
  tries = 0
35
  while tries < max_retries:
36
  try:
37
  responses = self.model.generate_content(
38
- [prompt, image],
39
  stream=False,
40
  generation_config={
41
  "temperature": 0,
 
1
  import json
2
  import time
3
+ from typing import List
4
 
5
  import PIL
6
  import google.generativeai as genai
 
26
  def generate_response(
27
  self,
28
  prompt: str,
29
+ image: PIL.Image.Image | List[PIL.Image.Image],
30
  block: Block,
31
  response_schema: content.Schema,
32
  max_retries: int = 3,
33
  timeout: int = 60
34
  ):
35
+ if not isinstance(image, list):
36
+ image = [image]
37
  tries = 0
38
  while tries < max_retries:
39
  try:
40
  responses = self.model.generate_content(
41
+ image + [prompt], # According to gemini docs, it performs better if the image is the first element
42
  stream=False,
43
  generation_config={
44
  "temperature": 0,
marker/processors/table.py CHANGED
@@ -1,18 +1,22 @@
1
-
2
- from typing import Annotated
 
 
3
 
4
  from ftfy import fix_text
5
- from surya.input.pdflines import get_page_text_lines
6
- from surya.model.detection.model import EfficientViTForSemanticSegmentation
7
- from surya.model.recognition.encoderdecoder import OCREncoderDecoderModel
8
- from surya.model.table_rec.encoderdecoder import TableRecEncoderDecoderModel
9
- from tabled.assignment import assign_rows_columns
10
- from tabled.inference.recognition import get_cells, recognize_tables
11
 
12
  from marker.processors import BaseProcessor
13
  from marker.schema import BlockTypes
 
14
  from marker.schema.document import Document
 
15
  from marker.settings import settings
 
16
 
17
 
18
  class TableProcessor(BaseProcessor):
@@ -42,9 +46,9 @@ class TableProcessor(BaseProcessor):
42
 
43
  def __init__(
44
  self,
45
- detection_model: EfficientViTForSemanticSegmentation,
46
- recognition_model: OCREncoderDecoderModel,
47
- table_rec_model: TableRecEncoderDecoderModel,
48
  config=None
49
  ):
50
  super().__init__(config)
@@ -59,51 +63,207 @@ class TableProcessor(BaseProcessor):
59
  table_data = []
60
  for page in document.pages:
61
  for block in page.contained_blocks(document, self.block_types):
62
- image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
63
- image = page.highres_image.crop(image_poly.bbox).convert("RGB")
64
-
65
- if block.text_extraction_method == "surya":
66
- text_lines = None
67
- else:
68
- text_lines = get_page_text_lines(
69
- filepath,
70
- [page.page_id],
71
- [page.highres_image.size],
72
- flatten_pdf=True
73
- )[0]
74
 
75
  table_data.append({
76
  "block_id": block.id,
 
77
  "table_image": image,
78
  "table_bbox": image_poly.bbox,
79
- "text_lines": text_lines,
80
- "img_size": page.highres_image.size
81
  })
82
 
83
- lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]]
 
84
 
85
- cells, needs_ocr = get_cells(
86
- *lst_format,
87
- [self.detection_model, self.detection_model.processor],
88
- detect_boxes=self.detect_boxes,
89
- detector_batch_size=self.get_detector_batch_size()
90
- )
91
 
92
- tables = recognize_tables(
93
  [t["table_image"] for t in table_data],
94
- cells,
95
- needs_ocr,
96
- [self.table_rec_model, self.table_rec_model.processor, self.recognition_model, self.recognition_model.processor],
97
- table_rec_batch_size=self.get_table_rec_batch_size(),
98
- ocr_batch_size=self.get_recognition_batch_size()
99
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- for table_d, table_res in zip(table_data, tables):
102
- block = document.get_block(table_d["block_id"])
103
- cells = assign_rows_columns(table_res, table_d["img_size"])
104
- for cell in cells:
105
- cell.text = fix_text(cell.text)
106
- block.cells = cells
107
 
108
  def get_detector_batch_size(self):
109
  if self.detector_batch_size is not None:
 
1
+ import re
2
+ from collections import defaultdict
3
+ from copy import deepcopy
4
+ from typing import Annotated, List
5
 
6
  from ftfy import fix_text
7
+ from surya.detection import DetectionPredictor
8
+ from surya.recognition import RecognitionPredictor, OCRResult
9
+ from surya.table_rec import TableRecPredictor
10
+ from surya.table_rec.schema import TableResult, TableCell as SuryaTableCell
11
+ from pdftext.extraction import table_output
 
12
 
13
  from marker.processors import BaseProcessor
14
  from marker.schema import BlockTypes
15
+ from marker.schema.blocks.tablecell import TableCell
16
  from marker.schema.document import Document
17
+ from marker.schema.polygon import PolygonBox
18
  from marker.settings import settings
19
+ from marker.util import matrix_intersection_area
20
 
21
 
22
  class TableProcessor(BaseProcessor):
 
46
 
47
  def __init__(
48
  self,
49
+ detection_model: DetectionPredictor,
50
+ recognition_model: RecognitionPredictor,
51
+ table_rec_model: TableRecPredictor,
52
  config=None
53
  ):
54
  super().__init__(config)
 
63
  table_data = []
64
  for page in document.pages:
65
  for block in page.contained_blocks(document, self.block_types):
66
+ image = block.get_image(document, highres=True, expansion=(.01, .01))
67
+ image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
 
 
 
 
 
 
 
 
 
 
68
 
69
  table_data.append({
70
  "block_id": block.id,
71
+ "page_id": page.page_id,
72
  "table_image": image,
73
  "table_bbox": image_poly.bbox,
74
+ "img_size": page.get_image(highres=True).size,
75
+ "ocr_block": page.text_extraction_method == "surya",
76
  })
77
 
78
+ extract_blocks = [t for t in table_data if not t["ocr_block"]]
79
+ self.assign_pdftext_lines(extract_blocks, filepath) # Handle tables where good text exists in the PDF
80
 
81
+ ocr_blocks = [t for t in table_data if t["ocr_block"]]
82
+ self.assign_ocr_lines(ocr_blocks) # Handle tables where OCR is needed
83
+ assert all("table_text_lines" in t for t in table_data), "All table data must have table cells"
 
 
 
84
 
85
+ tables: List[TableResult] = self.table_rec_model(
86
  [t["table_image"] for t in table_data],
87
+ batch_size=self.get_table_rec_batch_size()
 
 
 
 
88
  )
89
+ self.assign_text_to_cells(tables, table_data)
90
+ self.split_combined_rows(tables) # Split up rows that were combined
91
+
92
+ # Assign table cells to the table
93
+ table_idx = 0
94
+ for page in document.pages:
95
+ for block in page.contained_blocks(document, self.block_types):
96
+ block.structure = [] # Remove any existing lines, spans, etc.
97
+ cells: List[SuryaTableCell] = tables[table_idx].cells
98
+ for cell in cells:
99
+ # Rescale the cell polygon to the page size
100
+ cell_polygon = PolygonBox(polygon=cell.polygon).rescale(page.get_image(highres=True).size, page.polygon.size)
101
+ cell_block = TableCell(
102
+ polygon=cell_polygon,
103
+ text=self.finalize_cell_text(cell),
104
+ rowspan=cell.rowspan,
105
+ colspan=cell.colspan,
106
+ row_id=cell.row_id,
107
+ col_id=cell.col_id,
108
+ is_header=bool(cell.is_header),
109
+ page_id=page.page_id,
110
+ )
111
+ page.add_full_block(cell_block)
112
+ block.add_structure(cell_block)
113
+ table_idx += 1
114
+
115
+ def finalize_cell_text(self, cell: SuryaTableCell):
116
+ text = "\n".join([t["text"].strip() for t in cell.text_lines]) if cell.text_lines else ""
117
+ text = re.sub(r"(\s\.){2,}", "", text) # Replace . . .
118
+ text = re.sub(r"\.{2,}", "", text) # Replace ..., like in table of contents
119
+ return self.normalize_spaces(fix_text(text))
120
+
121
+ @staticmethod
122
+ def normalize_spaces(text):
123
+ space_chars = [
124
+ '\u2003', # em space
125
+ '\u2002', # en space
126
+ '\u00A0', # non-breaking space
127
+ '\u200B', # zero-width space
128
+ '\u3000', # ideographic space
129
+ ]
130
+ for space in space_chars:
131
+ text = text.replace(space, ' ')
132
+ return text
133
+
134
+ def split_combined_rows(self, tables: List[TableResult]):
135
+ for table in tables:
136
+ if len(table.cells) == 0:
137
+ # Skip empty tables
138
+ continue
139
+ unique_rows = sorted(list(set([c.row_id for c in table.cells])))
140
+ new_cells = []
141
+ shift_up = 0
142
+ max_cell_id = max([c.cell_id for c in table.cells])
143
+ new_cell_count = 0
144
+ for row in unique_rows:
145
+ # Cells in this row
146
+ # Deepcopy is because we do an in-place mutation later, and that can cause rows to shift to match rows in unique_rows
147
+ # making them be processed twice
148
+ row_cells = deepcopy([c for c in table.cells if c.row_id == row])
149
+ rowspans = [c.rowspan for c in row_cells]
150
+ line_lens = [len(c.text_lines) if isinstance(c.text_lines, list) else 1 for c in row_cells]
151
+
152
+ # Other cells that span into this row
153
+ rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
154
+ should_split = all([
155
+ len(row_cells) > 0,
156
+ len(rowspan_cells) == 0,
157
+ all([r == 1 for r in rowspans]),
158
+ all([l > 1 for l in line_lens]),
159
+ all([l == line_lens[0] for l in line_lens])
160
+ ])
161
+ if should_split:
162
+ for i in range(0, line_lens[0]):
163
+ for cell in row_cells:
164
+ line = cell.text_lines[i]
165
+ cell_id = max_cell_id + new_cell_count
166
+ new_cells.append(
167
+ SuryaTableCell(
168
+ polygon=line["bbox"],
169
+ text_lines=[line],
170
+ rowspan=1,
171
+ colspan=cell.colspan,
172
+ row_id=cell.row_id + shift_up + i,
173
+ col_id=cell.col_id,
174
+ is_header=cell.is_header and i == 0, # Only first line is header
175
+ within_row_id=cell.within_row_id,
176
+ cell_id=cell_id
177
+ )
178
+ )
179
+ new_cell_count += 1
180
+
181
+ # For each new row we add, shift up subsequent rows
182
+ shift_up += line_lens[0] - 1
183
+ else:
184
+ for cell in row_cells:
185
+ cell.row_id += shift_up
186
+ new_cells.append(cell)
187
+
188
+ # Only update the cells if we added new cells
189
+ if len(new_cells) > len(table.cells):
190
+ table.cells = new_cells
191
+
192
+ def assign_text_to_cells(self, tables: List[TableResult], table_data: list):
193
+ for table_result, table_page_data in zip(tables, table_data):
194
+ table_text_lines = table_page_data["table_text_lines"]
195
+ table_cells: List[SuryaTableCell] = table_result.cells
196
+ text_line_bboxes = [t["bbox"] for t in table_text_lines]
197
+ table_cell_bboxes = [c.bbox for c in table_cells]
198
+
199
+ intersection_matrix = matrix_intersection_area(text_line_bboxes, table_cell_bboxes)
200
+
201
+ cell_text = defaultdict(list)
202
+ for text_line_idx, table_text_line in enumerate(table_text_lines):
203
+ intersections = intersection_matrix[text_line_idx]
204
+ if intersections.sum() == 0:
205
+ continue
206
+
207
+ max_intersection = intersections.argmax()
208
+ cell_text[max_intersection].append(table_text_line)
209
+
210
+ for k in cell_text:
211
+ # TODO: see if the text needs to be sorted (based on rotation)
212
+ text = cell_text[k]
213
+ assert all("text" in t for t in text), "All text lines must have text"
214
+ assert all("bbox" in t for t in text), "All text lines must have a bbox"
215
+ table_cells[k].text_lines = text
216
+
217
+ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
218
+ table_inputs = []
219
+ unique_pages = list(set([t["page_id"] for t in extract_blocks]))
220
+ if len(unique_pages) == 0:
221
+ return
222
+
223
+ for page in unique_pages:
224
+ tables = []
225
+ img_size = None
226
+ for block in extract_blocks:
227
+ if block["page_id"] == page:
228
+ tables.append(block["table_bbox"])
229
+ img_size = block["img_size"]
230
+
231
+ table_inputs.append({
232
+ "tables": tables,
233
+ "img_size": img_size
234
+ })
235
+ cell_text = table_output(filepath, table_inputs, page_range=unique_pages)
236
+ assert len(cell_text) == len(unique_pages), "Number of pages and table inputs must match"
237
+
238
+ for pidx, (page_tables, pnum) in enumerate(zip(cell_text, unique_pages)):
239
+ table_idx = 0
240
+ for block in extract_blocks:
241
+ if block["page_id"] == pnum:
242
+ block["table_text_lines"] = page_tables[table_idx]
243
+ table_idx += 1
244
+ assert table_idx == len(page_tables), "Number of tables and table inputs must match"
245
+
246
+ def assign_ocr_lines(self, ocr_blocks: list):
247
+ det_images = [t["table_image"] for t in ocr_blocks]
248
+ ocr_results: List[OCRResult] = self.recognition_model(
249
+ det_images,
250
+ [None] * len(det_images),
251
+ self.detection_model,
252
+ recognition_batch_size=self.get_recognition_batch_size(),
253
+ detection_batch_size=self.get_detector_batch_size()
254
+ )
255
+
256
+ for block, ocr_res in zip(ocr_blocks, ocr_results):
257
+ table_cells = []
258
+ for line in ocr_res.text_lines:
259
+ # Don't need to correct back to image size
260
+ # Table rec boxes are relative to the table
261
+ table_cells.append({
262
+ "bbox": line.bbox,
263
+ "text": line.text
264
+ })
265
+ block["table_text_lines"] = table_cells
266
 
 
 
 
 
 
 
267
 
268
  def get_detector_batch_size(self):
269
  if self.detector_batch_size is not None:
marker/providers/__init__.py CHANGED
@@ -3,6 +3,9 @@ from typing import List, Optional, Dict
3
  from PIL import Image
4
  from pydantic import BaseModel
5
 
 
 
 
6
  from marker.schema.text import Span
7
  from marker.schema.text.line import Line
8
  from marker.util import assign_config
@@ -29,8 +32,17 @@ class BaseProvider:
29
  def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
30
  pass
31
 
32
- def get_page_bbox(self, idx: int) -> List[float]:
33
  pass
34
 
35
  def get_page_lines(self, idx: int) -> List[Line]:
36
  pass
 
 
 
 
 
 
 
 
 
 
3
  from PIL import Image
4
  from pydantic import BaseModel
5
 
6
+ from pdftext.schema import Reference
7
+
8
+ from marker.schema.polygon import PolygonBox
9
  from marker.schema.text import Span
10
  from marker.schema.text.line import Line
11
  from marker.util import assign_config
 
32
  def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
33
  pass
34
 
35
+ def get_page_bbox(self, idx: int) -> PolygonBox | None:
36
  pass
37
 
38
  def get_page_lines(self, idx: int) -> List[Line]:
39
  pass
40
+
41
+ def get_page_refs(self, idx: int) -> List[Reference]:
42
+ pass
43
+
44
+ def __enter__(self):
45
+ return self
46
+
47
+ def __exit__(self, exc_type, exc_value, traceback):
48
+ raise NotImplementedError
marker/providers/image.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Annotated, Optional
2
+ from PIL import Image
3
+
4
+ from marker.providers import ProviderPageLines, BaseProvider
5
+ from marker.schema.polygon import PolygonBox
6
+ from marker.schema.text import Line
7
+ from pdftext.schema import Reference
8
+
9
+
10
+ class ImageProvider(BaseProvider):
11
+ page_range: Annotated[
12
+ Optional[List[int]],
13
+ "The range of pages to process.",
14
+ "Default is None, which will process all pages."
15
+ ] = None
16
+
17
+ image_count: int = 1
18
+
19
+ def __init__(self, filepath: str, config=None):
20
+ super().__init__(filepath, config)
21
+
22
+ self.images = [Image.open(filepath)]
23
+ self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}
24
+
25
+ if self.page_range is None:
26
+ self.page_range = range(self.image_count)
27
+
28
+ assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, \
29
+ f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
30
+
31
+ self.page_bboxes = {i: [0, 0, self.images[i].size[0], self.images[i].size[1]] for i in self.page_range}
32
+
33
+ def __len__(self):
34
+ return self.image_count
35
+
36
+ def __exit__(self, exc_type, exc_value, traceback):
37
+ pass
38
+
39
+ def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
40
+ return [self.images[i] for i in idxs]
41
+
42
+ def get_page_bbox(self, idx: int) -> PolygonBox | None:
43
+ bbox = self.page_bboxes[idx]
44
+ if bbox:
45
+ return PolygonBox.from_bbox(bbox)
46
+
47
+
48
+ def get_page_lines(self, idx: int) -> List[Line]:
49
+ return self.page_lines[idx]
50
+
51
+ def get_page_refs(self, idx: int) -> List[Reference]:
52
+ return []
marker/providers/pdf.py CHANGED
@@ -9,6 +9,7 @@ from ftfy import fix_text
9
  from pdftext.extraction import dictionary_output
10
  from pdftext.schema import Reference
11
  from PIL import Image
 
12
 
13
  from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
14
  from marker.providers.utils import alphanum_ratio
@@ -91,9 +92,6 @@ class PdfProvider(BaseProvider):
91
 
92
  atexit.register(self.cleanup_pdf_doc)
93
 
94
- def __enter__(self):
95
- return self
96
-
97
  def __exit__(self, exc_type, exc_value, traceback):
98
  self.cleanup_pdf_doc()
99
 
@@ -155,6 +153,19 @@ class PdfProvider(BaseProvider):
155
  formats.add("italic")
156
  return formats
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  def pdftext_extraction(self) -> ProviderPageLines:
159
  page_lines: ProviderPageLines = {}
160
  page_char_blocks = dictionary_output(
@@ -191,7 +202,7 @@ class PdfProvider(BaseProvider):
191
  spans.append(
192
  SpanClass(
193
  polygon=polygon,
194
- text=fix_text(span["text"]),
195
  font=font_name,
196
  font_weight=font_weight,
197
  font_size=font_size,
@@ -234,7 +245,11 @@ class PdfProvider(BaseProvider):
234
  def check_page(self, page_id: int) -> bool:
235
  page = self.doc.get_page(page_id)
236
  page_bbox = PolygonBox.from_bbox(page.get_bbox())
237
- page_objs = list(page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT, pdfium_c.FPDF_PAGEOBJ_IMAGE]))
 
 
 
 
238
 
239
  # if we do not see any text objects in the pdf, we can skip this page
240
  if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
@@ -313,7 +328,7 @@ class PdfProvider(BaseProvider):
313
  def get_page_lines(self, idx: int) -> List[ProviderOutput]:
314
  return self.page_lines[idx]
315
 
316
- def get_page_refs(self, idx: int):
317
  return self.page_refs[idx]
318
 
319
  @staticmethod
 
9
  from pdftext.extraction import dictionary_output
10
  from pdftext.schema import Reference
11
  from PIL import Image
12
+ from pypdfium2 import PdfiumError
13
 
14
  from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
15
  from marker.providers.utils import alphanum_ratio
 
92
 
93
  atexit.register(self.cleanup_pdf_doc)
94
 
 
 
 
95
  def __exit__(self, exc_type, exc_value, traceback):
96
  self.cleanup_pdf_doc()
97
 
 
153
  formats.add("italic")
154
  return formats
155
 
156
+ @staticmethod
157
+ def normalize_spaces(text):
158
+ space_chars = [
159
+ '\u2003', # em space
160
+ '\u2002', # en space
161
+ '\u00A0', # non-breaking space
162
+ '\u200B', # zero-width space
163
+ '\u3000', # ideographic space
164
+ ]
165
+ for space in space_chars:
166
+ text = text.replace(space, ' ')
167
+ return text
168
+
169
  def pdftext_extraction(self) -> ProviderPageLines:
170
  page_lines: ProviderPageLines = {}
171
  page_char_blocks = dictionary_output(
 
202
  spans.append(
203
  SpanClass(
204
  polygon=polygon,
205
+ text=self.normalize_spaces(fix_text(span["text"])),
206
  font=font_name,
207
  font_weight=font_weight,
208
  font_size=font_size,
 
245
  def check_page(self, page_id: int) -> bool:
246
  page = self.doc.get_page(page_id)
247
  page_bbox = PolygonBox.from_bbox(page.get_bbox())
248
+ try:
249
+ page_objs = list(page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT, pdfium_c.FPDF_PAGEOBJ_IMAGE]))
250
+ except PdfiumError:
251
+ # Happens when pdfium fails to get the number of page objects
252
+ return False
253
 
254
  # if we do not see any text objects in the pdf, we can skip this page
255
  if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
 
328
  def get_page_lines(self, idx: int) -> List[ProviderOutput]:
329
  return self.page_lines[idx]
330
 
331
+ def get_page_refs(self, idx: int) -> List[Reference]:
332
  return self.page_refs[idx]
333
 
334
  @staticmethod
marker/providers/registry.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import filetype
2
+
3
+ from marker.providers.image import ImageProvider
4
+ from marker.providers.pdf import PdfProvider
5
+
6
+
7
+ def provider_from_filepath(filepath: str):
8
+ kind = filetype.image_match(filepath)
9
+ if kind is not None:
10
+ return ImageProvider
11
+
12
+ return PdfProvider
marker/renderers/__init__.py CHANGED
@@ -2,7 +2,7 @@ import base64
2
  import io
3
  import re
4
  from collections import Counter
5
- from typing import Annotated, Optional, Tuple
6
 
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
@@ -17,6 +17,11 @@ from marker.util import assign_config
17
  class BaseRenderer:
18
  image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
19
  extract_images: Annotated[bool, "Extract images from the document."] = True
 
 
 
 
 
20
 
21
  def __init__(self, config: Optional[BaseModel | dict] = None):
22
  assign_config(self, config)
@@ -25,13 +30,10 @@ class BaseRenderer:
25
  # Children are in reading order
26
  raise NotImplementedError
27
 
28
- @staticmethod
29
- def extract_image(document: Document, image_id, to_base64=False):
30
  image_block = document.get_block(image_id)
31
- page = document.get_page(image_block.page_id)
32
- page_img = page.highres_image
33
- image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
34
- cropped = page_img.crop(image_box.bbox)
35
  if to_base64:
36
  image_buffer = io.BytesIO()
37
  cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
@@ -44,7 +46,11 @@ class BaseRenderer:
44
  return html
45
 
46
  def replace_whitespace(match):
47
- return match.group(1)
 
 
 
 
48
 
49
  pattern = fr'</{tag}>(\s*)<{tag}>'
50
 
 
2
  import io
3
  import re
4
  from collections import Counter
5
+ from typing import Annotated, Optional, Tuple, Literal
6
 
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
 
17
  class BaseRenderer:
18
  image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
19
  extract_images: Annotated[bool, "Extract images from the document."] = True
20
+ image_extraction_mode: Annotated[
21
+ Literal["lowres", "highres"],
22
+ "The mode to use for extracting images.",
23
+ ] = "highres"
24
+
25
 
26
  def __init__(self, config: Optional[BaseModel | dict] = None):
27
  assign_config(self, config)
 
30
  # Children are in reading order
31
  raise NotImplementedError
32
 
33
+ def extract_image(self, document: Document, image_id, to_base64=False):
 
34
  image_block = document.get_block(image_id)
35
+ cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres")
36
+
 
 
37
  if to_base64:
38
  image_buffer = io.BytesIO()
39
  cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
 
46
  return html
47
 
48
  def replace_whitespace(match):
49
+ whitespace = match.group(1)
50
+ if len(whitespace) == 0:
51
+ return ""
52
+ else:
53
+ return " "
54
 
55
  pattern = fr'</{tag}>(\s*)<{tag}>'
56
 
marker/renderers/html.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from PIL import Image
2
  from typing import Annotated, Literal, Tuple
3
 
@@ -35,17 +37,10 @@ class HTMLRenderer(BaseRenderer):
35
  bool,
36
  "Whether to paginate the output.",
37
  ] = False
38
- image_extraction_mode: Annotated[
39
- Literal["lowres", "highres"],
40
- "The mode to use for extracting images.",
41
- ] = "highres"
42
 
43
  def extract_image(self, document, image_id):
44
  image_block = document.get_block(image_id)
45
- page = document.get_page(image_block.page_id)
46
- page_img = page.lowres_image if self.image_extraction_mode == "lowres" else page.highres_image
47
- image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
48
- cropped = page_img.crop(image_box.bbox)
49
  return cropped
50
 
51
  def extract_html(self, document, document_output, level=0):
@@ -87,12 +82,25 @@ class HTMLRenderer(BaseRenderer):
87
  if level == 0:
88
  output = self.merge_consecutive_tags(output, 'b')
89
  output = self.merge_consecutive_tags(output, 'i')
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  return output, images
92
 
93
  def __call__(self, document) -> HTMLOutput:
94
  document_output = document.render()
95
  full_html, images = self.extract_html(document, document_output)
 
 
96
  return HTMLOutput(
97
  html=full_html,
98
  images=images,
 
1
+ import textwrap
2
+
3
  from PIL import Image
4
  from typing import Annotated, Literal, Tuple
5
 
 
37
  bool,
38
  "Whether to paginate the output.",
39
  ] = False
 
 
 
 
40
 
41
  def extract_image(self, document, image_id):
42
  image_block = document.get_block(image_id)
43
+ cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres")
 
 
 
44
  return cropped
45
 
46
  def extract_html(self, document, document_output, level=0):
 
82
  if level == 0:
83
  output = self.merge_consecutive_tags(output, 'b')
84
  output = self.merge_consecutive_tags(output, 'i')
85
+ output = textwrap.dedent(f"""
86
+ <!DOCTYPE html>
87
+ <html>
88
+ <head>
89
+ <meta charset="utf-8" />
90
+ </head>
91
+ <body>
92
+ {output}
93
+ </body>
94
+ </html>
95
+ """)
96
 
97
  return output, images
98
 
99
  def __call__(self, document) -> HTMLOutput:
100
  document_output = document.render()
101
  full_html, images = self.extract_html(document, document_output)
102
+ soup = BeautifulSoup(full_html, 'html.parser')
103
+ full_html = soup.prettify() # Add indentation to the HTML
104
  return HTMLOutput(
105
  html=full_html,
106
  images=images,
marker/renderers/json.py CHANGED
@@ -14,6 +14,7 @@ class JSONBlockOutput(BaseModel):
14
  block_type: str
15
  html: str
16
  polygon: List[List[float]]
 
17
  children: List['JSONBlockOutput'] | None = None
18
  section_hierarchy: Dict[int, str] | None = None
19
  images: dict | None = None
@@ -52,6 +53,7 @@ class JSONRenderer(BaseRenderer):
52
  return JSONBlockOutput(
53
  html=html,
54
  polygon=block_output.polygon.polygon,
 
55
  id=str(block_output.id),
56
  block_type=str(block_output.id.block_type),
57
  images=images,
@@ -66,6 +68,7 @@ class JSONRenderer(BaseRenderer):
66
  return JSONBlockOutput(
67
  html=block_output.html,
68
  polygon=block_output.polygon.polygon,
 
69
  id=str(block_output.id),
70
  block_type=str(block_output.id.block_type),
71
  children=children,
 
14
  block_type: str
15
  html: str
16
  polygon: List[List[float]]
17
+ bbox: List[float]
18
  children: List['JSONBlockOutput'] | None = None
19
  section_hierarchy: Dict[int, str] | None = None
20
  images: dict | None = None
 
53
  return JSONBlockOutput(
54
  html=html,
55
  polygon=block_output.polygon.polygon,
56
+ bbox=block_output.polygon.bbox,
57
  id=str(block_output.id),
58
  block_type=str(block_output.id.block_type),
59
  images=images,
 
68
  return JSONBlockOutput(
69
  html=block_output.html,
70
  polygon=block_output.polygon.polygon,
71
+ bbox=block_output.polygon.bbox,
72
  id=str(block_output.id),
73
  block_type=str(block_output.id.block_type),
74
  children=children,
marker/renderers/markdown.py CHANGED
@@ -1,4 +1,5 @@
1
  import re
 
2
  from typing import Annotated, Tuple
3
 
4
  import regex
@@ -13,7 +14,7 @@ from marker.schema.document import Document
13
  def cleanup_text(full_text):
14
  full_text = re.sub(r'\n{3,}', '\n\n', full_text)
15
  full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
16
- return full_text
17
 
18
 
19
  class Markdownify(MarkdownConverter):
@@ -53,13 +54,88 @@ class Markdownify(MarkdownConverter):
53
  else:
54
  return "\n" + self.block_math_delimiters[0] + text + self.block_math_delimiters[1] + "\n"
55
 
56
- def convert_td(self, el, text, convert_as_inline):
57
- text = text.replace("|", " ").replace("\n", " ")
58
- return super().convert_td(el, text, convert_as_inline)
59
-
60
- def convert_th(self, el, text, convert_as_inline):
61
- text = text.replace("|", " ").replace("\n", " ")
62
- return super().convert_th(el, text, convert_as_inline)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  def convert_a(self, el, text, convert_as_inline):
65
  text = self.escape(text)
 
1
  import re
2
+ from collections import defaultdict
3
  from typing import Annotated, Tuple
4
 
5
  import regex
 
14
  def cleanup_text(full_text):
15
  full_text = re.sub(r'\n{3,}', '\n\n', full_text)
16
  full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
17
+ return full_text.strip()
18
 
19
 
20
  class Markdownify(MarkdownConverter):
 
54
  else:
55
  return "\n" + self.block_math_delimiters[0] + text + self.block_math_delimiters[1] + "\n"
56
 
57
+ def convert_table(self, el, text, convert_as_inline):
58
+ total_rows = len(el.find_all('tr'))
59
+ colspans = []
60
+ rowspan_cols = defaultdict(int)
61
+ for i, row in enumerate(el.find_all('tr')):
62
+ row_cols = rowspan_cols[i]
63
+ for cell in row.find_all(['td', 'th']):
64
+ colspan = int(cell.get('colspan', 1))
65
+ row_cols += colspan
66
+ for r in range(int(cell.get('rowspan', 1)) - 1):
67
+ rowspan_cols[i + r] += colspan # Add the colspan to the next rows, so they get the correct number of columns
68
+ colspans.append(row_cols)
69
+ total_cols = max(colspans)
70
+
71
+ grid = [[None for _ in range(total_cols)] for _ in range(total_rows)]
72
+
73
+ for row_idx, tr in enumerate(el.find_all('tr')):
74
+ col_idx = 0
75
+ for cell in tr.find_all(['td', 'th']):
76
+ # Skip filled positions
77
+ while col_idx < total_cols and grid[row_idx][col_idx] is not None:
78
+ col_idx += 1
79
+
80
+ # Fill in grid
81
+ value = cell.get_text(strip=True).replace("\n", " ").replace("|", " ")
82
+ rowspan = int(cell.get('rowspan', 1))
83
+ colspan = int(cell.get('colspan', 1))
84
+
85
+ if col_idx >= total_cols:
86
+ # Skip this cell if we're out of bounds
87
+ continue
88
+
89
+ for r in range(rowspan):
90
+ for c in range(colspan):
91
+ try:
92
+ if r == 0 and c == 0:
93
+ grid[row_idx][col_idx] = value
94
+ else:
95
+ grid[row_idx + r][col_idx + c] = ''
96
+ except IndexError:
97
+ # Sometimes the colspan/rowspan predictions can overflow
98
+ print(f"Overflow in columns: {col_idx + c} >= {total_cols}")
99
+ continue
100
+
101
+ col_idx += colspan
102
+
103
+ markdown_lines = []
104
+ col_widths = [0] * total_cols
105
+ for row in grid:
106
+ for col_idx, cell in enumerate(row):
107
+ if cell is not None:
108
+ col_widths[col_idx] = max(col_widths[col_idx], len(str(cell)))
109
+
110
+ add_header_line = lambda: markdown_lines.append('|' + '|'.join('-' * (width + 2) for width in col_widths) + '|')
111
+
112
+ # Generate markdown rows
113
+ added_header = False
114
+ for i, row in enumerate(grid):
115
+ is_empty_line = all(not cell for cell in row)
116
+ if is_empty_line and not added_header:
117
+ # Skip leading blank lines
118
+ continue
119
+
120
+ line = []
121
+ for col_idx, cell in enumerate(row):
122
+ if cell is None:
123
+ cell = ''
124
+ padding = col_widths[col_idx] - len(str(cell))
125
+ line.append(f" {cell}{' ' * padding} ")
126
+ markdown_lines.append('|' + '|'.join(line) + '|')
127
+
128
+ if not added_header:
129
+ # Skip empty lines when adding the header row
130
+ add_header_line()
131
+ added_header = True
132
+
133
+ # Handle one row tables
134
+ if total_rows == 1:
135
+ add_header_line()
136
+
137
+ table_md = '\n'.join(markdown_lines)
138
+ return "\n\n" + table_md + "\n\n"
139
 
140
  def convert_a(self, el, text, convert_as_inline):
141
  text = self.escape(text)
marker/schema/__init__.py CHANGED
@@ -27,6 +27,7 @@ class BlockTypes(str, Enum):
27
  TableOfContents = auto()
28
  Document = auto()
29
  ComplexRegion = auto()
 
30
  Reference = auto()
31
 
32
  def __str__(self):
 
27
  TableOfContents = auto()
28
  Document = auto()
29
  ComplexRegion = auto()
30
+ TableCell = auto()
31
  Reference = auto()
32
 
33
  def __str__(self):
marker/schema/blocks/__init__.py CHANGED
@@ -18,4 +18,5 @@ from marker.schema.blocks.table import Table
18
  from marker.schema.blocks.text import Text
19
  from marker.schema.blocks.toc import TableOfContents
20
  from marker.schema.blocks.complexregion import ComplexRegion
 
21
  from marker.schema.blocks.reference import Reference
 
18
  from marker.schema.blocks.text import Text
19
  from marker.schema.blocks.toc import TableOfContents
20
  from marker.schema.blocks.complexregion import ComplexRegion
21
+ from marker.schema.blocks.tablecell import TableCell
22
  from marker.schema.blocks.reference import Reference
marker/schema/blocks/base.py CHANGED
@@ -1,8 +1,9 @@
1
  from __future__ import annotations
2
 
3
- from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence
4
 
5
  from pydantic import BaseModel, ConfigDict, field_validator
 
6
 
7
  from marker.schema import BlockTypes
8
  from marker.schema.polygon import PolygonBox
@@ -71,6 +72,7 @@ class BlockId(BaseModel):
71
 
72
  class Block(BaseModel):
73
  polygon: PolygonBox
 
74
  block_type: Optional[BlockTypes] = None
75
  block_id: Optional[int] = None
76
  page_id: Optional[int] = None
@@ -81,6 +83,8 @@ class Block(BaseModel):
81
  source: Literal['layout', 'heuristics', 'processor'] = 'layout'
82
  top_k: Optional[Dict[BlockTypes, float]] = None
83
  metadata: BlockMetadata | None = None
 
 
84
 
85
  model_config = ConfigDict(arbitrary_types_allowed=True)
86
 
@@ -97,6 +101,21 @@ class Block(BaseModel):
97
  block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
98
  return cls(**block_attrs)
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
101
  if self.structure is None:
102
  return []
@@ -163,7 +182,7 @@ class Block(BaseModel):
163
  text += "\n"
164
  return text
165
 
166
- def assemble_html(self, child_blocks: List[BlockOutput], parent_structure: Optional[List[str]] = None):
167
  if self.ignore_for_output:
168
  return ""
169
 
@@ -172,7 +191,8 @@ class Block(BaseModel):
172
  template += f"<content-ref src='{c.id}'></content-ref>"
173
 
174
  if self.replace_output_newlines:
175
- template = "<p>" + template.replace("\n", " ") + "</p>"
 
176
 
177
  return template
178
 
@@ -205,7 +225,7 @@ class Block(BaseModel):
205
  self.structure[i] = new_block.id
206
  break
207
 
208
- def render(self, document: Document, parent_structure: Optional[List[str]], section_hierarchy=None):
209
  child_content = []
210
  if section_hierarchy is None:
211
  section_hierarchy = {}
@@ -219,7 +239,7 @@ class Block(BaseModel):
219
  child_content.append(rendered)
220
 
221
  return BlockOutput(
222
- html=self.assemble_html(child_content, parent_structure),
223
  polygon=self.polygon,
224
  id=self.id,
225
  children=child_content,
 
1
  from __future__ import annotations
2
 
3
+ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, Tuple
4
 
5
  from pydantic import BaseModel, ConfigDict, field_validator
6
+ from PIL import Image
7
 
8
  from marker.schema import BlockTypes
9
  from marker.schema.polygon import PolygonBox
 
72
 
73
  class Block(BaseModel):
74
  polygon: PolygonBox
75
+ block_description: str
76
  block_type: Optional[BlockTypes] = None
77
  block_id: Optional[int] = None
78
  page_id: Optional[int] = None
 
83
  source: Literal['layout', 'heuristics', 'processor'] = 'layout'
84
  top_k: Optional[Dict[BlockTypes, float]] = None
85
  metadata: BlockMetadata | None = None
86
+ lowres_image: Image.Image | None = None
87
+ highres_image: Image.Image | None = None
88
 
89
  model_config = ConfigDict(arbitrary_types_allowed=True)
90
 
 
101
  block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
102
  return cls(**block_attrs)
103
 
104
+ def get_image(self, document: Document, highres: bool = False, expansion: Tuple[float, float] | None = None) -> Image.Image | None:
105
+ image = self.highres_image if highres else self.lowres_image
106
+ if image is None:
107
+ page = document.get_page(self.page_id)
108
+ page_image = page.highres_image if highres else page.lowres_image
109
+
110
+ # Scale to the image size
111
+ bbox = self.polygon.rescale((page.polygon.width, page.polygon.height), page_image.size)
112
+ if expansion:
113
+ bbox = bbox.expand(*expansion)
114
+ bbox = bbox.bbox
115
+ image = page_image.crop(bbox)
116
+ return image
117
+
118
+
119
  def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
120
  if self.structure is None:
121
  return []
 
182
  text += "\n"
183
  return text
184
 
185
+ def assemble_html(self, document: Document, child_blocks: List[BlockOutput], parent_structure: Optional[List[str]] = None):
186
  if self.ignore_for_output:
187
  return ""
188
 
 
191
  template += f"<content-ref src='{c.id}'></content-ref>"
192
 
193
  if self.replace_output_newlines:
194
+ template = template.replace("\n", " ")
195
+ template = "<p>" + template + "</p>"
196
 
197
  return template
198
 
 
225
  self.structure[i] = new_block.id
226
  break
227
 
228
+ def render(self, document: Document, parent_structure: Optional[List[str]] = None, section_hierarchy: dict | None = None):
229
  child_content = []
230
  if section_hierarchy is None:
231
  section_hierarchy = {}
 
239
  child_content.append(rendered)
240
 
241
  return BlockOutput(
242
+ html=self.assemble_html(document, child_content, parent_structure),
243
  polygon=self.polygon,
244
  id=self.id,
245
  children=child_content,
marker/schema/blocks/basetable.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from marker.schema import BlockTypes
4
+ from marker.schema.blocks import Block, BlockOutput
5
+ from marker.schema.blocks.tablecell import TableCell
6
+
7
+
8
+ class BaseTable(Block):
9
+ block_type: BlockTypes | None = None
10
+ html: str | None = None
11
+
12
+ def format_cells(self, document, child_blocks):
13
+ child_cells: List[TableCell] = [document.get_block(c.id) for c in child_blocks]
14
+ unique_rows = sorted(list(set([c.row_id for c in child_cells])))
15
+ html_repr = "<table><tbody>"
16
+ for row_id in unique_rows:
17
+ row_cells = sorted([c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id)
18
+ html_repr += "<tr>"
19
+ for cell in row_cells:
20
+ html_repr += cell.assemble_html(document, child_blocks, None)
21
+ html_repr += "</tr>"
22
+ html_repr += "</tbody></table>"
23
+ return html_repr
24
+
25
+
26
+ def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
27
+ # Filter out the table cells, so they don't render twice
28
+ child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
29
+ template = super().assemble_html(document, child_ref_blocks, parent_structure)
30
+
31
+ if self.html:
32
+ # LLM processor
33
+ return template + self.html
34
+ elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
35
+ # Table processor
36
+ return template + self.format_cells(document, child_blocks)
37
+ else:
38
+ # Default text lines and spans
39
+ return f"<p>{template}</p>"
marker/schema/blocks/caption.py CHANGED
@@ -4,4 +4,6 @@ from marker.schema.blocks import Block
4
 
5
  class Caption(Block):
6
  block_type: BlockTypes = BlockTypes.Caption
 
7
  replace_output_newlines: bool = True
 
 
4
 
5
  class Caption(Block):
6
  block_type: BlockTypes = BlockTypes.Caption
7
+ block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
8
  replace_output_newlines: bool = True
9
+
marker/schema/blocks/code.py CHANGED
@@ -7,8 +7,9 @@ from marker.schema.blocks import Block
7
  class Code(Block):
8
  block_type: BlockTypes = BlockTypes.Code
9
  code: str | None = None
 
10
 
11
- def assemble_html(self, child_blocks, parent_structure):
12
  code = self.code or ""
13
  return (f"<pre>"
14
  f"{html.escape(code)}"
 
7
  class Code(Block):
8
  block_type: BlockTypes = BlockTypes.Code
9
  code: str | None = None
10
+ block_description: str = "A programming code block."
11
 
12
+ def assemble_html(self, document, child_blocks, parent_structure):
13
  code = self.code or ""
14
  return (f"<pre>"
15
  f"{html.escape(code)}"
marker/schema/blocks/complexregion.py CHANGED
@@ -5,10 +5,11 @@ from marker.schema.blocks import Block
5
  class ComplexRegion(Block):
6
  block_type: BlockTypes = BlockTypes.ComplexRegion
7
  html: str | None = None
 
8
 
9
- def assemble_html(self, child_blocks, parent_structure):
10
  if self.html:
11
  return self.html
12
  else:
13
- template = super().assemble_html(child_blocks, parent_structure)
14
  return f"<p>{template}</p>"
 
5
  class ComplexRegion(Block):
6
  block_type: BlockTypes = BlockTypes.ComplexRegion
7
  html: str | None = None
8
+ block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."
9
 
10
+ def assemble_html(self, document, child_blocks, parent_structure):
11
  if self.html:
12
  return self.html
13
  else:
14
+ template = super().assemble_html(document, child_blocks, parent_structure)
15
  return f"<p>{template}</p>"
marker/schema/blocks/equation.py CHANGED
@@ -7,11 +7,12 @@ from marker.schema.blocks import Block
7
  class Equation(Block):
8
  block_type: BlockTypes = BlockTypes.Equation
9
  latex: str | None = None
 
10
 
11
- def assemble_html(self, child_blocks, parent_structure=None):
12
  if self.latex:
13
  child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
14
- html_out = super().assemble_html(child_ref_blocks, parent_structure)
15
  html_out += f"<p block-type='{self.block_type}'>"
16
 
17
  try:
@@ -33,7 +34,7 @@ class Equation(Block):
33
  html_out += "</p>"
34
  return html_out
35
  else:
36
- template = super().assemble_html(child_blocks, parent_structure)
37
  return f"<p block-type='{self.block_type}'>{template}</p>"
38
 
39
  @staticmethod
 
7
  class Equation(Block):
8
  block_type: BlockTypes = BlockTypes.Equation
9
  latex: str | None = None
10
+ block_description: str = "A block math equation."
11
 
12
+ def assemble_html(self, document, child_blocks, parent_structure=None):
13
  if self.latex:
14
  child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
15
+ html_out = super().assemble_html(document, child_ref_blocks, parent_structure)
16
  html_out += f"<p block-type='{self.block_type}'>"
17
 
18
  try:
 
34
  html_out += "</p>"
35
  return html_out
36
  else:
37
+ template = super().assemble_html(document, child_blocks, parent_structure)
38
  return f"<p block-type='{self.block_type}'>{template}</p>"
39
 
40
  @staticmethod
marker/schema/blocks/figure.py CHANGED
@@ -5,10 +5,11 @@ from marker.schema.blocks import Block
5
  class Figure(Block):
6
  block_type: BlockTypes = BlockTypes.Figure
7
  description: str | None = None
 
8
 
9
- def assemble_html(self, child_blocks, parent_structure):
10
  child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
11
- html = super().assemble_html(child_ref_blocks, parent_structure)
12
  if self.description:
13
  html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
14
  return html
 
5
  class Figure(Block):
6
  block_type: BlockTypes = BlockTypes.Figure
7
  description: str | None = None
8
+ block_description: str = "A chart or other image that contains data."
9
 
10
+ def assemble_html(self, document, child_blocks, parent_structure):
11
  child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12
+ html = super().assemble_html(document, child_ref_blocks, parent_structure)
13
  if self.description:
14
  html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
15
  return html
marker/schema/blocks/footnote.py CHANGED
@@ -4,4 +4,5 @@ from marker.schema.blocks import Block
4
 
5
  class Footnote(Block):
6
  block_type: BlockTypes = BlockTypes.Footnote
 
7
  replace_output_newlines: bool = True
 
4
 
5
  class Footnote(Block):
6
  block_type: BlockTypes = BlockTypes.Footnote
7
+ block_description: str = "A footnote that explains a term or concept in the document."
8
  replace_output_newlines: bool = True
marker/schema/blocks/form.py CHANGED
@@ -1,20 +1,9 @@
1
  from typing import List
2
 
3
- from tabled.formats import html_format
4
- from tabled.schema import SpanTableCell
5
-
6
  from marker.schema import BlockTypes
7
- from marker.schema.blocks import Block
8
-
9
-
10
- class Form(Block):
11
- block_type: str = BlockTypes.Form
12
- cells: List[SpanTableCell] | None = None
13
- html: str | None = None
14
 
15
- def assemble_html(self, child_blocks, parent_structure=None):
16
- # Some processors convert the form to html
17
- if self.html is not None:
18
- return self.html
19
 
20
- return str(html_format(self.cells))
 
 
 
1
  from typing import List
2
 
 
 
 
3
  from marker.schema import BlockTypes
4
+ from marker.schema.blocks.basetable import BaseTable
 
 
 
 
 
 
5
 
 
 
 
 
6
 
7
+ class Form(BaseTable):
8
+ block_type: BlockTypes = BlockTypes.Form
9
+ block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure."
marker/schema/blocks/handwriting.py CHANGED
@@ -4,4 +4,12 @@ from marker.schema.blocks import Block
4
 
5
  class Handwriting(Block):
6
  block_type: BlockTypes = BlockTypes.Handwriting
 
 
7
  replace_output_newlines: bool = True
 
 
 
 
 
 
 
4
 
5
  class Handwriting(Block):
6
  block_type: BlockTypes = BlockTypes.Handwriting
7
+ block_description: str = "A region that contains handwriting."
8
+ html: str | None = None
9
  replace_output_newlines: bool = True
10
+
11
+ def assemble_html(self, document, child_blocks, parent_structure):
12
+ if self.html:
13
+ return self.html
14
+ else:
15
+ return super().assemble_html(document, child_blocks, parent_structure)