Moses Paul R committed on
Commit
ca8a504
·
2 Parent(s): 69b4d9a 6bd5629

Merge remote-tracking branch 'origin/dev' into dev-mose/input-formats-2

Browse files
README.md CHANGED
@@ -219,7 +219,12 @@ rendered = converter("FILEPATH")
219
  text, _, images = text_from_rendered(rendered)
220
  ```
221
 
222
- This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
 
 
 
 
 
223
 
224
  # Output Formats
225
 
@@ -400,8 +405,8 @@ Marker can extract tables from PDFs using `marker.converters.table.TableConverte
400
 
401
  | Avg score | Total tables | use_llm |
402
  |-----------|--------------|---------|
403
- | 0.824 | 54 | False |
404
- | 0.873 | 54 | True |
405
 
406
  The `--use_llm` flag can significantly improve table recognition performance, as you can see.
407
 
 
219
  text, _, images = text_from_rendered(rendered)
220
  ```
221
 
222
+ This takes all the same configuration as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
223
+
224
+ You can also run this via the CLI with
225
+ ```shell
226
+ python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
227
+ ```
228
 
229
  # Output Formats
230
 
 
405
 
406
  | Avg score | Total tables | use_llm |
407
  |-----------|--------------|---------|
408
+ | 0.822 | 54 | False |
409
+ | 0.887 | 54 | True |
410
 
411
  The `--use_llm` flag can significantly improve table recognition performance, as you can see.
412
 
benchmarks/table/gemini.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from PIL import Image
3
+ import google.generativeai as genai
4
+ from google.ai.generativelanguage_v1beta.types import content
5
+ from marker.settings import settings
6
+
7
+ prompt = """
8
+ You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
9
+
10
+ Guidelines:
11
+ - Keep the HTML simple and concise.
12
+ - Only include the <table> tag and contents.
13
+ - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags.
14
+ - Make sure the table is as faithful to the image as possible with the given tags.
15
+
16
+ **Instructions**
17
+ 1. Analyze the image, and determine the table structure.
18
+ 2. Convert the table image to HTML, following the guidelines above.
19
+ 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
20
+ """.strip()
21
+
22
+ genai.configure(api_key=settings.GOOGLE_API_KEY)
23
+
24
+ def gemini_table_rec(image: Image.Image):
25
+ schema = content.Schema(
26
+ type=content.Type.OBJECT,
27
+ required=["table_html"],
28
+ properties={
29
+ "table_html": content.Schema(
30
+ type=content.Type.STRING,
31
+ )
32
+ }
33
+ )
34
+
35
+ model = genai.GenerativeModel("gemini-1.5-flash")
36
+
37
+ responses = model.generate_content(
38
+ [image, prompt], # According to gemini docs, it performs better if the image is the first element
39
+ stream=False,
40
+ generation_config={
41
+ "temperature": 0,
42
+ "response_schema": schema,
43
+ "response_mime_type": "application/json",
44
+ },
45
+ request_options={'timeout': 60}
46
+ )
47
+
48
+ output = responses.candidates[0].content.parts[0].text
49
+ return json.loads(output)["table_html"]
benchmarks/table/table.py CHANGED
@@ -1,12 +1,11 @@
1
  import os
2
- from typing import List
3
-
4
- import numpy as np
5
-
6
- from marker.renderers.json import JSONOutput, JSONBlockOutput
7
 
8
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
9
 
 
 
10
  import base64
11
  import time
12
  import datasets
@@ -16,21 +15,24 @@ import click
16
  from tabulate import tabulate
17
  import json
18
  from bs4 import BeautifulSoup
19
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
20
  from pypdfium2._helpers.misc import PdfiumError
 
21
  from marker.util import matrix_intersection_area
 
22
 
23
  from marker.config.parser import ConfigParser
24
  from marker.converters.table import TableConverter
25
  from marker.models import create_model_dict
26
 
27
  from scoring import wrap_table_html, similarity_eval_html
 
28
 
29
- def update_teds_score(result):
30
- prediction, ground_truth = result['marker_table'], result['gt_table']
31
  prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
32
  score = similarity_eval_html(prediction, ground_truth)
33
- result.update({'score':score})
34
  return result
35
 
36
 
@@ -51,7 +53,16 @@ def extract_tables(children: List[JSONBlockOutput]):
51
  @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
52
  @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
53
  @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
54
- def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm: bool, table_rec_batch_size: int | None):
 
 
 
 
 
 
 
 
 
55
  models = create_model_dict()
56
  config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
57
  start = time.time()
@@ -86,6 +97,9 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
86
  marker_json = converter(temp_pdf_file.name).children
87
  tqdm.disable = False
88
 
 
 
 
89
  if len(marker_json) == 0 or len(gt_tables) == 0:
90
  print(f'No tables detected, skipping...')
91
  total_unaligned += len(gt_tables)
@@ -94,6 +108,8 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
94
  marker_tables = extract_tables(marker_json)
95
  marker_table_boxes = [table.bbox for table in marker_tables]
96
  page_bbox = marker_json[0].bbox
 
 
97
 
98
  # Normalize the bboxes
99
  for bbox in marker_table_boxes:
@@ -136,14 +152,18 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
136
  unaligned_tables.add(table_idx)
137
  continue
138
 
 
 
 
 
139
  aligned_tables.append(
140
- (marker_tables[aligned_idx], gt_tables[table_idx])
141
  )
142
  used_tables.add(aligned_idx)
143
 
144
  total_unaligned += len(unaligned_tables)
145
 
146
- for marker_table, gt_table in aligned_tables:
147
  gt_table_html = gt_table['html']
148
 
149
  #marker wraps the table in <tbody> which fintabnet data doesn't
@@ -154,10 +174,12 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
154
  th_tag.name = 'td'
155
  marker_table_html = str(marker_table_soup)
156
  marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
 
157
 
158
  results.append({
159
  "marker_table": marker_table_html,
160
- "gt_table": gt_table_html
 
161
  })
162
  except PdfiumError:
163
  print('Broken PDF, Skipping...')
@@ -167,19 +189,37 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
167
  print(f"Could not align {total_unaligned} tables from fintabnet.")
168
 
169
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
170
- results = list(
171
  tqdm(
172
  executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
173
  )
174
  )
175
- avg_score = sum([r["score"] for r in results]) / len(results)
176
 
 
177
  headers = ["Avg score", "Total tables"]
178
- data = [f"{avg_score:.3f}", len(results)]
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  table = tabulate([data], headers=headers, tablefmt="github")
180
  print(table)
181
  print("Avg score computed by comparing marker predicted HTML with original HTML")
182
 
 
 
 
 
 
183
  with open(out_file, "w+") as f:
184
  json.dump(results, f, indent=2)
185
 
 
1
  import os
2
+ from itertools import repeat
3
+ from tkinter import Image
 
 
 
4
 
5
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
6
 
7
+ from typing import List
8
+ import numpy as np
9
  import base64
10
  import time
11
  import datasets
 
15
  from tabulate import tabulate
16
  import json
17
  from bs4 import BeautifulSoup
18
+ from concurrent.futures import ProcessPoolExecutor
19
  from pypdfium2._helpers.misc import PdfiumError
20
+ import pypdfium2 as pdfium
21
  from marker.util import matrix_intersection_area
22
+ from marker.renderers.json import JSONOutput, JSONBlockOutput
23
 
24
  from marker.config.parser import ConfigParser
25
  from marker.converters.table import TableConverter
26
  from marker.models import create_model_dict
27
 
28
  from scoring import wrap_table_html, similarity_eval_html
29
+ from gemini import gemini_table_rec
30
 
31
+ def update_teds_score(result, prefix: str = "marker"):
32
+ prediction, ground_truth = result[f'{prefix}_table'], result['gt_table']
33
  prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
34
  score = similarity_eval_html(prediction, ground_truth)
35
+ result.update({f'{prefix}_score':score})
36
  return result
37
 
38
 
 
53
  @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
54
  @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
55
  @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
56
+ @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
57
+ def main(
58
+ out_file: str,
59
+ dataset: str,
60
+ max_rows: int,
61
+ max_workers: int,
62
+ use_llm: bool,
63
+ table_rec_batch_size: int | None,
64
+ use_gemini: bool = False
65
+ ):
66
  models = create_model_dict()
67
  config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
68
  start = time.time()
 
97
  marker_json = converter(temp_pdf_file.name).children
98
  tqdm.disable = False
99
 
100
+ doc = pdfium.PdfDocument(temp_pdf_file.name)
101
+ page_image = doc[0].render(scale=92/72).to_pil()
102
+
103
  if len(marker_json) == 0 or len(gt_tables) == 0:
104
  print(f'No tables detected, skipping...')
105
  total_unaligned += len(gt_tables)
 
108
  marker_tables = extract_tables(marker_json)
109
  marker_table_boxes = [table.bbox for table in marker_tables]
110
  page_bbox = marker_json[0].bbox
111
+ w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
112
+ table_images = [page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox in marker_table_boxes]
113
 
114
  # Normalize the bboxes
115
  for bbox in marker_table_boxes:
 
152
  unaligned_tables.add(table_idx)
153
  continue
154
 
155
+ gemini_html = ""
156
+ if use_gemini:
157
+ gemini_html = gemini_table_rec(table_images[aligned_idx])
158
+
159
  aligned_tables.append(
160
+ (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
161
  )
162
  used_tables.add(aligned_idx)
163
 
164
  total_unaligned += len(unaligned_tables)
165
 
166
+ for marker_table, gt_table, gemini_table in aligned_tables:
167
  gt_table_html = gt_table['html']
168
 
169
  #marker wraps the table in <tbody> which fintabnet data doesn't
 
174
  th_tag.name = 'td'
175
  marker_table_html = str(marker_table_soup)
176
  marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
177
+ gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines
178
 
179
  results.append({
180
  "marker_table": marker_table_html,
181
+ "gt_table": gt_table_html,
182
+ "gemini_table": gemini_table_html
183
  })
184
  except PdfiumError:
185
  print('Broken PDF, Skipping...')
 
189
  print(f"Could not align {total_unaligned} tables from fintabnet.")
190
 
191
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
192
+ marker_results = list(
193
  tqdm(
194
  executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
195
  )
196
  )
 
197
 
198
+ avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results)
199
  headers = ["Avg score", "Total tables"]
200
+ data = [f"{avg_score:.3f}", len(marker_results)]
201
+ gemini_results = None
202
+ if use_gemini:
203
+ with ProcessPoolExecutor(max_workers=max_workers) as executor:
204
+ gemini_results = list(
205
+ tqdm(
206
+ executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
207
+ total=len(results)
208
+ )
209
+ )
210
+ avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
211
+ headers.append("Avg Gemini score")
212
+ data.append(f"{avg_gemini_score:.3f}")
213
+
214
  table = tabulate([data], headers=headers, tablefmt="github")
215
  print(table)
216
  print("Avg score computed by comparing marker predicted HTML with original HTML")
217
 
218
+ results = {
219
+ "marker": marker_results,
220
+ "gemini": gemini_results
221
+ }
222
+
223
  with open(out_file, "w+") as f:
224
  json.dump(results, f, indent=2)
225
 
chunk_convert.py CHANGED
@@ -1,4 +1,4 @@
1
- from marker.scripts import chunk_convert_cli
2
 
3
  if __name__ == "__main__":
4
  chunk_convert_cli()
 
1
+ from marker.scripts.chunk_convert import chunk_convert_cli
2
 
3
  if __name__ == "__main__":
4
  chunk_convert_cli()
convert.py CHANGED
@@ -1,4 +1,4 @@
1
- from marker.scripts import convert_cli
2
 
3
  if __name__ == "__main__":
4
  convert_cli()
 
1
+ from marker.scripts.convert import convert_cli
2
 
3
  if __name__ == "__main__":
4
  convert_cli()
convert_single.py CHANGED
@@ -1,4 +1,4 @@
1
- from marker.scripts import convert_single_cli
2
 
3
  if __name__ == "__main__":
4
  convert_single_cli()
 
1
+ from marker.scripts.convert_single import convert_single_cli
2
 
3
  if __name__ == "__main__":
4
  convert_single_cli()
marker/builders/ocr.py CHANGED
@@ -35,6 +35,10 @@ class OcrBuilder(BaseBuilder):
35
  "A list of languages to use for OCR.",
36
  "Default is None."
37
  ] = None
 
 
 
 
38
 
39
  def __init__(self, detection_model: DetectionPredictor, recognition_model: RecognitionPredictor, config=None):
40
  super().__init__(config)
@@ -67,12 +71,12 @@ class OcrBuilder(BaseBuilder):
67
 
68
  # Remove tables because we re-OCR them later with the table processor
69
  recognition_results = self.recognition_model(
70
- images=[page.get_image(highres=False, remove_tables=True) for page in page_list],
71
  langs=[self.languages] * len(page_list),
72
  det_predictor=self.detection_model,
73
  detection_batch_size=int(self.get_detection_batch_size()),
74
  recognition_batch_size=int(self.get_recognition_batch_size()),
75
- highres_images=[page.get_image(highres=True, remove_tables=True) for page in page_list]
76
  )
77
 
78
  page_lines = {}
 
35
  "A list of languages to use for OCR.",
36
  "Default is None."
37
  ] = None
38
+ enable_table_ocr: Annotated[
39
+ bool,
40
+ "Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
41
+ ] = False
42
 
43
  def __init__(self, detection_model: DetectionPredictor, recognition_model: RecognitionPredictor, config=None):
44
  super().__init__(config)
 
71
 
72
  # Remove tables because we re-OCR them later with the table processor
73
  recognition_results = self.recognition_model(
74
+ images=[page.get_image(highres=False, remove_tables=not self.enable_table_ocr) for page in page_list],
75
  langs=[self.languages] * len(page_list),
76
  det_predictor=self.detection_model,
77
  detection_batch_size=int(self.get_detection_batch_size()),
78
  recognition_batch_size=int(self.get_recognition_batch_size()),
79
+ highres_images=[page.get_image(highres=True, remove_tables=not self.enable_table_ocr) for page in page_list]
80
  )
81
 
82
  page_lines = {}
marker/processors/llm/llm_form.py CHANGED
@@ -17,7 +17,7 @@ Values and labels should appear in html tables, with the labels on the left side
17
  **Instructions:**
18
  1. Carefully examine the provided form block image.
19
  2. Analyze the html representation of the form.
20
- 3. If the html representation is largely correct, then write "No corrections needed."
21
  4. If the html representation contains errors, generate the corrected html representation.
22
  5. Output only either the corrected html representation or "No corrections needed."
23
  **Example:**
 
17
  **Instructions:**
18
  1. Carefully examine the provided form block image.
19
  2. Analyze the html representation of the form.
20
+ 3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
21
  4. If the html representation contains errors, generate the corrected html representation.
22
  5. Output only either the corrected html representation or "No corrections needed."
23
  **Example:**
marker/processors/llm/llm_table.py CHANGED
@@ -16,9 +16,9 @@ class LLMTableProcessor(BaseLLMProcessor):
16
  Tuple[BlockTypes],
17
  "The block types to process.",
18
  ] = (BlockTypes.Table, BlockTypes.TableOfContents)
19
- max_row_count: Annotated[
20
  int,
21
- "If the table has more rows than this, don't run LLM processor. (LLMs can be inaccurate with a lot of rows)",
22
  ] = 75
23
  table_rewriting_prompt: Annotated[
24
  str,
@@ -37,7 +37,7 @@ Some guidelines:
37
  **Instructions:**
38
  1. Carefully examine the provided text block image.
39
  2. Analyze the html representation of the table.
40
- 3. If the html representation is largely correct, then write "No corrections needed."
41
  4. If the html representation contains errors, generate the corrected html representation.
42
  5. Output only either the corrected html representation or "No corrections needed."
43
  **Example:**
@@ -74,7 +74,9 @@ No corrections needed.
74
 
75
  # LLMs don't handle tables with a lot of rows very well
76
  row_count = len(set([cell.row_id for cell in children]))
77
- if row_count > self.max_row_count:
 
 
78
  return
79
 
80
  block_html = block.render(document).html
 
16
  Tuple[BlockTypes],
17
  "The block types to process.",
18
  ] = (BlockTypes.Table, BlockTypes.TableOfContents)
19
+ max_rows_per_batch: Annotated[
20
  int,
21
+ "If the table has more rows than this, chunk the table. (LLMs can be inaccurate with a lot of rows)",
22
  ] = 75
23
  table_rewriting_prompt: Annotated[
24
  str,
 
37
  **Instructions:**
38
  1. Carefully examine the provided text block image.
39
  2. Analyze the html representation of the table.
40
+ 3. If the html representation is largely correct, or you cannot read the image properly, then write "No corrections needed."
41
  4. If the html representation contains errors, generate the corrected html representation.
42
  5. Output only either the corrected html representation or "No corrections needed."
43
  **Example:**
 
74
 
75
  # LLMs don't handle tables with a lot of rows very well
76
  row_count = len(set([cell.row_id for cell in children]))
77
+
78
+ # TODO: eventually chunk the table and inference each chunk
79
+ if row_count > self.max_rows_per_batch:
80
  return
81
 
82
  block_html = block.render(document).html
marker/processors/llm/llm_table_merge.py CHANGED
@@ -55,7 +55,7 @@ You'll specify your judgement in json format - first whether Table 2 should be m
55
 
56
  Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
57
 
58
- Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging.
59
 
60
  **Instructions:**
61
  1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
 
55
 
56
  Table 2 should be merged at the bottom of Table 1 if Table 2 has no headers, and the rows have similar values, meaning that Table 2 continues Table 1. Table 2 should be merged to the right of Table 1 if each row in Table 2 matches a row in Table 1, meaning that Table 2 contains additional columns that augment Table 1.
57
 
58
+ Only merge Table 1 and Table 2 if Table 2 cannot be interpreted without merging. Only merge Table 1 and Table 2 if you can read both images properly.
59
 
60
  **Instructions:**
61
  1. Carefully examine the provided table images. Table 1 is the first image, and Table 2 is the second image.
marker/processors/table.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  from collections import defaultdict
3
  from copy import deepcopy
4
  from typing import Annotated, List
 
 
5
 
6
  from ftfy import fix_text
7
  from surya.detection import DetectionPredictor
@@ -67,7 +69,7 @@ class TableProcessor(BaseProcessor):
67
  table_data = []
68
  for page in document.pages:
69
  for block in page.contained_blocks(document, self.block_types):
70
- image = block.get_image(document, highres=True, expansion=(.01, .01))
71
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
72
 
73
  table_data.append({
@@ -165,22 +167,35 @@ class TableProcessor(BaseProcessor):
165
 
166
  # Other cells that span into this row
167
  rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
168
- should_split = all([
169
- len(row_cells) > 0,
170
  len(rowspan_cells) == 0,
171
  all([r == 1 for r in rowspans]),
172
  all([l > 1 for l in line_lens]),
173
  all([l == line_lens[0] for l in line_lens])
174
  ])
 
 
 
 
 
 
 
 
 
175
  if should_split:
176
- for i in range(0, line_lens[0]):
177
  for cell in row_cells:
178
- line = cell.text_lines[i]
 
 
 
 
179
  cell_id = max_cell_id + new_cell_count
180
  new_cells.append(
181
  SuryaTableCell(
182
- polygon=line["bbox"],
183
- text_lines=[line],
184
  rowspan=1,
185
  colspan=cell.colspan,
186
  row_id=cell.row_id + shift_up + i,
 
2
  from collections import defaultdict
3
  from copy import deepcopy
4
  from typing import Annotated, List
5
+ from collections import Counter
6
+ from PIL import ImageDraw
7
 
8
  from ftfy import fix_text
9
  from surya.detection import DetectionPredictor
 
69
  table_data = []
70
  for page in document.pages:
71
  for block in page.contained_blocks(document, self.block_types):
72
+ image = block.get_image(document, highres=True)
73
  image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.get_image(highres=True).size)
74
 
75
  table_data.append({
 
167
 
168
  # Other cells that span into this row
169
  rowspan_cells = [c for c in table.cells if c.row_id != row and c.row_id + c.rowspan > row > c.row_id]
170
+ should_split_entire_row = all([
171
+ len(row_cells) > 1,
172
  len(rowspan_cells) == 0,
173
  all([r == 1 for r in rowspans]),
174
  all([l > 1 for l in line_lens]),
175
  all([l == line_lens[0] for l in line_lens])
176
  ])
177
+ line_lens_counter = Counter(line_lens)
178
+ counter_keys = sorted(list(line_lens_counter.keys()))
179
+ should_split_partial_row = all([
180
+ len(row_cells) > 3, # Only split if there are more than 3 cells
181
+ len(rowspan_cells) == 0,
182
+ all([r == 1 for r in rowspans]),
183
+ len(line_lens_counter) == 2 and counter_keys[0] <= 1 and counter_keys[1] > 1 and line_lens_counter[counter_keys[0]] == 1, # Allow a single column with a single line - keys are the line lens, values are the counts
184
+ ])
185
+ should_split = should_split_entire_row or should_split_partial_row
186
  if should_split:
187
+ for i in range(0, max(line_lens)):
188
  for cell in row_cells:
189
+ # Calculate height based on number of splits
190
+ split_height = cell.bbox[3] - cell.bbox[1]
191
+ current_bbox = [cell.bbox[0], cell.bbox[1] + i * split_height, cell.bbox[2], cell.bbox[1] + (i + 1) * split_height]
192
+
193
+ line = [cell.text_lines[i]] if cell.text_lines and i < len(cell.text_lines) else None
194
  cell_id = max_cell_id + new_cell_count
195
  new_cells.append(
196
  SuryaTableCell(
197
+ polygon=current_bbox,
198
+ text_lines=line,
199
  rowspan=1,
200
  colspan=cell.colspan,
201
  row_id=cell.row_id + shift_up + i,
marker/schema/blocks/base.py CHANGED
@@ -167,9 +167,10 @@ class Block(BaseModel):
167
  def raw_text(self, document: Document) -> str:
168
  from marker.schema.text.line import Line
169
  from marker.schema.text.span import Span
 
170
 
171
  if self.structure is None:
172
- if isinstance(self, Span):
173
  return self.text
174
  else:
175
  return ""
 
167
  def raw_text(self, document: Document) -> str:
168
  from marker.schema.text.line import Line
169
  from marker.schema.text.span import Span
170
+ from marker.schema.blocks.tablecell import TableCell
171
 
172
  if self.structure is None:
173
+ if isinstance(self, (Span, TableCell)):
174
  return self.text
175
  else:
176
  return ""
marker/scripts/__init__.py CHANGED
@@ -1,5 +0,0 @@
1
- from marker.scripts.convert_single import convert_single_cli
2
- from marker.scripts.convert import convert_cli
3
- from marker.scripts.server import server_cli
4
- from marker.scripts.run_streamlit_app import streamlit_app_cli
5
- from marker.scripts.chunk_convert import chunk_convert_cli
 
 
 
 
 
 
marker/scripts/convert.py CHANGED
@@ -100,7 +100,7 @@ def convert_cli(in_folder: str, **kwargs):
100
  else:
101
  model_dict = create_model_dict()
102
  for k, v in model_dict.items():
103
- v.share_memory()
104
 
105
  print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
106
  task_args = [(f, kwargs) for f in files_to_convert]
 
100
  else:
101
  model_dict = create_model_dict()
102
  for k, v in model_dict.items():
103
+ v.model.share_memory()
104
 
105
  print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
106
  task_args = [(f, kwargs) for f in files_to_convert]
marker/scripts/server.py CHANGED
@@ -3,7 +3,6 @@ import traceback
3
  import click
4
  import os
5
 
6
- import uvicorn
7
  from pydantic import BaseModel, Field
8
  from starlette.responses import HTMLResponse
9
 
@@ -163,6 +162,7 @@ async def convert_pdf_upload(
163
  @click.option("--port", type=int, default=8000, help="Port to run the server on")
164
  @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
165
  def server_cli(port: int, host: str):
 
166
  # Run the server
167
  uvicorn.run(
168
  app,
 
3
  import click
4
  import os
5
 
 
6
  from pydantic import BaseModel, Field
7
  from starlette.responses import HTMLResponse
8
 
 
162
  @click.option("--port", type=int, default=8000, help="Port to run the server on")
163
  @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
164
  def server_cli(port: int, host: str):
165
+ import uvicorn
166
  # Run the server
167
  uvicorn.run(
168
  app,
marker/scripts/streamlit_app.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
 
 
2
 
3
  from marker.settings import settings
4
  from streamlit.runtime.uploaded_file_manager import UploadedFile
5
 
6
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
7
- os.environ["IN_STREAMLIT"] = "true"
8
-
9
  import base64
10
  import io
11
  import re
@@ -69,15 +68,12 @@ def markdown_insert_images(markdown, images):
69
  def get_page_image(pdf_file, page_num, dpi=96):
70
  if "pdf" in pdf_file.type:
71
  doc = open_pdf(pdf_file)
72
- renderer = doc.render(
73
- pypdfium2.PdfBitmap.to_pil,
74
- page_indices=[page_num],
75
  scale=dpi / 72,
76
- )
77
- png = list(renderer)[0]
78
- png_image = png.convert("RGB")
79
  else:
80
- png_image = Image.open(in_file).convert("RGB")
81
  return png_image
82
 
83
 
 
1
  import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
3
+ os.environ["IN_STREAMLIT"] = "true"
4
 
5
  from marker.settings import settings
6
  from streamlit.runtime.uploaded_file_manager import UploadedFile
7
 
 
 
 
8
  import base64
9
  import io
10
  import re
 
68
  def get_page_image(pdf_file, page_num, dpi=96):
69
  if "pdf" in pdf_file.type:
70
  doc = open_pdf(pdf_file)
71
+ page = doc[page_num]
72
+ png_image = page.render(
 
73
  scale=dpi / 72,
74
+ ).to_pil().convert("RGB")
 
 
75
  else:
76
+ png_image = Image.open(pdf_file).convert("RGB")
77
  return png_image
78
 
79
 
marker_app.py CHANGED
@@ -1,4 +1,4 @@
1
- from marker.scripts import streamlit_app_cli
2
 
3
  if __name__ == "__main__":
4
  streamlit_app_cli()
 
1
+ from marker.scripts.run_streamlit_app import streamlit_app_cli
2
 
3
  if __name__ == "__main__":
4
  streamlit_app_cli()
marker_server.py CHANGED
@@ -1,4 +1,4 @@
1
- from marker.scripts import server_cli
2
 
3
  if __name__ == "__main__":
4
  server_cli()
 
1
+ from marker.scripts.server import server_cli
2
 
3
  if __name__ == "__main__":
4
  server_cli()
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -23,13 +23,12 @@ transformers = "^4.45.2"
23
  python-dotenv = "^1.0.0"
24
  torch = "^2.5.1"
25
  tqdm = "^4.66.1"
26
- tabulate = "^0.9.0"
27
  ftfy = "^6.1.1"
28
  texify = "^0.2.1"
29
  rapidfuzz = "^3.8.1"
30
- surya-ocr = "~0.8.3"
31
  regex = "^2024.4.28"
32
- pdftext = "~0.4.1"
33
  markdownify = "^0.13.1"
34
  click = "^8.1.7"
35
  google-generativeai = "^0.8.3"
@@ -53,6 +52,7 @@ pytest-mock = "^3.14.0"
53
  apted = "1.0.3"
54
  distance = "0.1.3"
55
  lxml = "5.3.0"
 
56
 
57
  [tool.poetry.scripts]
58
  marker = "marker.scripts.convert:convert_cli"
 
23
  python-dotenv = "^1.0.0"
24
  torch = "^2.5.1"
25
  tqdm = "^4.66.1"
 
26
  ftfy = "^6.1.1"
27
  texify = "^0.2.1"
28
  rapidfuzz = "^3.8.1"
29
+ surya-ocr = "~0.9.0"
30
  regex = "^2024.4.28"
31
+ pdftext = "~0.5.0"
32
  markdownify = "^0.13.1"
33
  click = "^8.1.7"
34
  google-generativeai = "^0.8.3"
 
52
  apted = "1.0.3"
53
  distance = "0.1.3"
54
  lxml = "5.3.0"
55
+ tabulate = "^0.9.0"
56
 
57
  [tool.poetry.scripts]
58
  marker = "marker.scripts.convert:convert_cli"
signatures/version1/cla.json CHANGED
@@ -111,6 +111,38 @@
111
  "created_at": "2024-12-05T13:13:34Z",
112
  "repoId": 712111618,
113
  "pullRequestNo": 416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  }
115
  ]
116
  }
 
111
  "created_at": "2024-12-05T13:13:34Z",
112
  "repoId": 712111618,
113
  "pullRequestNo": 416
114
+ },
115
+ {
116
+ "name": "tarun-menta",
117
+ "id": 66506307,
118
+ "comment_id": 2543907406,
119
+ "created_at": "2024-12-15T15:06:32Z",
120
+ "repoId": 712111618,
121
+ "pullRequestNo": 427
122
+ },
123
+ {
124
+ "name": "ZeyuTeng96",
125
+ "id": 96521059,
126
+ "comment_id": 2567236036,
127
+ "created_at": "2025-01-02T02:36:02Z",
128
+ "repoId": 712111618,
129
+ "pullRequestNo": 452
130
+ },
131
+ {
132
+ "name": "xiaoyao9184",
133
+ "id": 6614349,
134
+ "comment_id": 2571623521,
135
+ "created_at": "2025-01-05T13:15:34Z",
136
+ "repoId": 712111618,
137
+ "pullRequestNo": 463
138
+ },
139
+ {
140
+ "name": "yasyf",
141
+ "id": 709645,
142
+ "comment_id": 2571679069,
143
+ "created_at": "2025-01-05T16:23:12Z",
144
+ "repoId": 712111618,
145
+ "pullRequestNo": 464
146
  }
147
  ]
148
  }
tests/builders/test_garbled_pdf.py CHANGED
@@ -2,10 +2,11 @@ import pytest
2
 
3
  from marker.builders.document import DocumentBuilder
4
  from marker.builders.layout import LayoutBuilder
 
5
  from marker.schema import BlockTypes
6
 
7
  @pytest.mark.filename("water_damage.pdf")
8
- def test_garbled_pdf(pdf_document):
9
  assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
10
 
11
  table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
@@ -16,9 +17,16 @@ def test_garbled_pdf(pdf_document):
16
  assert table_cell.block_type == BlockTypes.Line
17
  assert table_cell.structure[0] == "/page/0/Span/2"
18
 
19
- span = pdf_document.pages[0].get_block(table_cell.structure[0])
20
  assert span.block_type == BlockTypes.Span
21
- assert "комплекс" in span.text
 
 
 
 
 
 
 
22
 
23
 
24
  @pytest.mark.filename("hindi_judgement.pdf")
@@ -30,7 +38,7 @@ def test_garbled_builder(config, pdf_provider, layout_model, ocr_error_model):
30
 
31
  bad_ocr_results = layout_builder.surya_ocr_error_detection(document.pages, pdf_provider.page_lines)
32
  assert len(bad_ocr_results.labels) == 2
33
- assert all([l == "bad" for l in bad_ocr_results.labels])
34
 
35
 
36
  @pytest.mark.filename("adversarial.pdf")
 
2
 
3
  from marker.builders.document import DocumentBuilder
4
  from marker.builders.layout import LayoutBuilder
5
+ from marker.processors.table import TableProcessor
6
  from marker.schema import BlockTypes
7
 
8
  @pytest.mark.filename("water_damage.pdf")
9
+ def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec_model):
10
  assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
11
 
12
  table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
 
17
  assert table_cell.block_type == BlockTypes.Line
18
  assert table_cell.structure[0] == "/page/0/Span/2"
19
 
20
+ span = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Span,))[0]
21
  assert span.block_type == BlockTypes.Span
22
+ assert len(span.text.strip()) == 0
23
+
24
+ # We don't OCR in the initial pass, only with the TableProcessor
25
+ processor = TableProcessor(detection_model, recognition_model, table_rec_model)
26
+ processor(pdf_document)
27
+
28
+ table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0]
29
+ assert "варіант" in table.raw_text(pdf_document)
30
 
31
 
32
  @pytest.mark.filename("hindi_judgement.pdf")
 
38
 
39
  bad_ocr_results = layout_builder.surya_ocr_error_detection(document.pages, pdf_provider.page_lines)
40
  assert len(bad_ocr_results.labels) == 2
41
+ assert any([l == "bad" for l in bad_ocr_results.labels])
42
 
43
 
44
  @pytest.mark.filename("adversarial.pdf")