Vik Paruchuri
committed on
Commit
·
a78718d
1
Parent(s):
f712a74
Add gemini to table bench
Browse files- README.md +1 -1
- benchmarks/table/gemini.py +49 -0
- benchmarks/table/table.py +56 -16
README.md
CHANGED
|
@@ -400,7 +400,7 @@ Marker can extract tables from PDFs using `marker.converters.table.TableConverte
|
|
| 400 |
|
| 401 |
| Avg score | Total tables | use_llm |
|
| 402 |
|-----------|--------------|---------|
|
| 403 |
-
| 0.
|
| 404 |
| 0.887 | 54 | True |
|
| 405 |
|
| 406 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
|
|
|
|
| 400 |
|
| 401 |
| Avg score | Total tables | use_llm |
|
| 402 |
|-----------|--------------|---------|
|
| 403 |
+
| 0.822 | 54 | False |
|
| 404 |
| 0.887 | 54 | True |
|
| 405 |
|
| 406 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
|
benchmarks/table/gemini.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import google.generativeai as genai
|
| 4 |
+
from google.ai.generativelanguage_v1beta.types import content
|
| 5 |
+
from marker.settings import settings
|
| 6 |
+
|
| 7 |
+
prompt = """
|
| 8 |
+
You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
|
| 9 |
+
|
| 10 |
+
Guidelines:
|
| 11 |
+
- Keep the HTML simple and concise.
|
| 12 |
+
- Only include the <table> tag and contents.
|
| 13 |
+
- Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags.
|
| 14 |
+
- Make sure the table is as faithful to the image as possible with the given tags.
|
| 15 |
+
|
| 16 |
+
**Instructions**
|
| 17 |
+
1. Analyze the image, and determine the table structure.
|
| 18 |
+
2. Convert the table image to HTML, following the guidelines above.
|
| 19 |
+
3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
|
| 20 |
+
""".strip()
|
| 21 |
+
|
| 22 |
+
genai.configure(api_key=settings.GOOGLE_API_KEY)
|
| 23 |
+
|
| 24 |
+
def gemini_table_rec(image: Image.Image):
    """Ask Gemini to transcribe a table image into HTML.

    Sends *image* together with the module-level ``prompt`` to
    gemini-1.5-flash, constrained by a JSON response schema with a single
    required ``table_html`` string field, and returns that HTML string.
    """
    # The response must be a JSON object with exactly one required string field.
    response_schema = content.Schema(
        type=content.Type.OBJECT,
        required=["table_html"],
        properties={"table_html": content.Schema(type=content.Type.STRING)},
    )

    gemini = genai.GenerativeModel("gemini-1.5-flash")

    # Image goes first — per the Gemini docs, multimodal prompts perform
    # better when the image precedes the text.
    response = gemini.generate_content(
        [image, prompt],
        stream=False,
        generation_config={
            "temperature": 0,
            "response_schema": response_schema,
            "response_mime_type": "application/json",
        },
        request_options={"timeout": 60},
    )

    raw_json = response.candidates[0].content.parts[0].text
    return json.loads(raw_json)["table_html"]
|
benchmarks/table/table.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
import os
|
| 2 |
-
from
|
| 3 |
-
|
| 4 |
-
import numpy as np
|
| 5 |
-
|
| 6 |
-
from marker.renderers.json import JSONOutput, JSONBlockOutput
|
| 7 |
|
| 8 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 9 |
|
|
|
|
|
|
|
| 10 |
import base64
|
| 11 |
import time
|
| 12 |
import datasets
|
|
@@ -16,21 +15,24 @@ import click
|
|
| 16 |
from tabulate import tabulate
|
| 17 |
import json
|
| 18 |
from bs4 import BeautifulSoup
|
| 19 |
-
from concurrent.futures import
|
| 20 |
from pypdfium2._helpers.misc import PdfiumError
|
|
|
|
| 21 |
from marker.util import matrix_intersection_area
|
|
|
|
| 22 |
|
| 23 |
from marker.config.parser import ConfigParser
|
| 24 |
from marker.converters.table import TableConverter
|
| 25 |
from marker.models import create_model_dict
|
| 26 |
|
| 27 |
from scoring import wrap_table_html, similarity_eval_html
|
|
|
|
| 28 |
|
| 29 |
-
def update_teds_score(result):
|
| 30 |
-
prediction, ground_truth = result['
|
| 31 |
prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
|
| 32 |
score = similarity_eval_html(prediction, ground_truth)
|
| 33 |
-
result.update({'
|
| 34 |
return result
|
| 35 |
|
| 36 |
|
|
@@ -51,7 +53,16 @@ def extract_tables(children: List[JSONBlockOutput]):
|
|
| 51 |
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
|
| 52 |
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
|
| 53 |
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
models = create_model_dict()
|
| 56 |
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
|
| 57 |
start = time.time()
|
|
@@ -86,6 +97,9 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 86 |
marker_json = converter(temp_pdf_file.name).children
|
| 87 |
tqdm.disable = False
|
| 88 |
|
|
|
|
|
|
|
|
|
|
| 89 |
if len(marker_json) == 0 or len(gt_tables) == 0:
|
| 90 |
print(f'No tables detected, skipping...')
|
| 91 |
total_unaligned += len(gt_tables)
|
|
@@ -94,6 +108,8 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 94 |
marker_tables = extract_tables(marker_json)
|
| 95 |
marker_table_boxes = [table.bbox for table in marker_tables]
|
| 96 |
page_bbox = marker_json[0].bbox
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# Normalize the bboxes
|
| 99 |
for bbox in marker_table_boxes:
|
|
@@ -136,14 +152,18 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 136 |
unaligned_tables.add(table_idx)
|
| 137 |
continue
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
aligned_tables.append(
|
| 140 |
-
(marker_tables[aligned_idx], gt_tables[table_idx])
|
| 141 |
)
|
| 142 |
used_tables.add(aligned_idx)
|
| 143 |
|
| 144 |
total_unaligned += len(unaligned_tables)
|
| 145 |
|
| 146 |
-
for marker_table, gt_table in aligned_tables:
|
| 147 |
gt_table_html = gt_table['html']
|
| 148 |
|
| 149 |
#marker wraps the table in <tbody> which fintabnet data doesn't
|
|
@@ -154,10 +174,12 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 154 |
th_tag.name = 'td'
|
| 155 |
marker_table_html = str(marker_table_soup)
|
| 156 |
marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
|
|
|
|
| 157 |
|
| 158 |
results.append({
|
| 159 |
"marker_table": marker_table_html,
|
| 160 |
-
"gt_table": gt_table_html
|
|
|
|
| 161 |
})
|
| 162 |
except PdfiumError:
|
| 163 |
print('Broken PDF, Skipping...')
|
|
@@ -167,19 +189,37 @@ def main(out_file: str, dataset: str, max_rows: int, max_workers: int, use_llm:
|
|
| 167 |
print(f"Could not align {total_unaligned} tables from fintabnet.")
|
| 168 |
|
| 169 |
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
| 170 |
-
|
| 171 |
tqdm(
|
| 172 |
executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
|
| 173 |
)
|
| 174 |
)
|
| 175 |
-
avg_score = sum([r["score"] for r in results]) / len(results)
|
| 176 |
|
|
|
|
| 177 |
headers = ["Avg score", "Total tables"]
|
| 178 |
-
data = [f"{avg_score:.3f}", len(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
table = tabulate([data], headers=headers, tablefmt="github")
|
| 180 |
print(table)
|
| 181 |
print("Avg score computed by comparing marker predicted HTML with original HTML")
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
with open(out_file, "w+") as f:
|
| 184 |
json.dump(results, f, indent=2)
|
| 185 |
|
|
|
|
| 1 |
import os
|
| 2 |
+
from itertools import repeat
|
| 3 |
+
from PIL import Image  # NOTE(review): the committed diff imports Image from tkinter, which is wrong (PIL provides Image) and breaks on headless installs; the import also appears unused in the visible code
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
|
| 6 |
|
| 7 |
+
from typing import List
|
| 8 |
+
import numpy as np
|
| 9 |
import base64
|
| 10 |
import time
|
| 11 |
import datasets
|
|
|
|
| 15 |
from tabulate import tabulate
|
| 16 |
import json
|
| 17 |
from bs4 import BeautifulSoup
|
| 18 |
+
from concurrent.futures import ProcessPoolExecutor
|
| 19 |
from pypdfium2._helpers.misc import PdfiumError
|
| 20 |
+
import pypdfium2 as pdfium
|
| 21 |
from marker.util import matrix_intersection_area
|
| 22 |
+
from marker.renderers.json import JSONOutput, JSONBlockOutput
|
| 23 |
|
| 24 |
from marker.config.parser import ConfigParser
|
| 25 |
from marker.converters.table import TableConverter
|
| 26 |
from marker.models import create_model_dict
|
| 27 |
|
| 28 |
from scoring import wrap_table_html, similarity_eval_html
|
| 29 |
+
from gemini import gemini_table_rec
|
| 30 |
|
| 31 |
+
def update_teds_score(result, prefix: str = "marker"):
    """Score one prediction/ground-truth pair and store the result in place.

    Looks up ``result[f"{prefix}_table"]`` and ``result["gt_table"]``, wraps
    both HTML fragments, computes their TEDS-style similarity, writes it back
    as ``result[f"{prefix}_score"]``, and returns the mutated dict.
    """
    pred_html = wrap_table_html(result[f'{prefix}_table'])
    gt_html = wrap_table_html(result['gt_table'])
    result.update({f'{prefix}_score': similarity_eval_html(pred_html, gt_html)})
    return result
|
| 37 |
|
| 38 |
|
|
|
|
| 53 |
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
|
| 54 |
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
|
| 55 |
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
|
| 56 |
+
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
|
| 57 |
+
def main(
|
| 58 |
+
out_file: str,
|
| 59 |
+
dataset: str,
|
| 60 |
+
max_rows: int,
|
| 61 |
+
max_workers: int,
|
| 62 |
+
use_llm: bool,
|
| 63 |
+
table_rec_batch_size: int | None,
|
| 64 |
+
use_gemini: bool = False
|
| 65 |
+
):
|
| 66 |
models = create_model_dict()
|
| 67 |
config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size})
|
| 68 |
start = time.time()
|
|
|
|
| 97 |
marker_json = converter(temp_pdf_file.name).children
|
| 98 |
tqdm.disable = False
|
| 99 |
|
| 100 |
+
doc = pdfium.PdfDocument(temp_pdf_file.name)
|
| 101 |
+
page_image = doc[0].render(scale=92/72).to_pil()
|
| 102 |
+
|
| 103 |
if len(marker_json) == 0 or len(gt_tables) == 0:
|
| 104 |
print(f'No tables detected, skipping...')
|
| 105 |
total_unaligned += len(gt_tables)
|
|
|
|
| 108 |
marker_tables = extract_tables(marker_json)
|
| 109 |
marker_table_boxes = [table.bbox for table in marker_tables]
|
| 110 |
page_bbox = marker_json[0].bbox
|
| 111 |
+
w_scaler, h_scaler = page_image.width / page_bbox[2], page_image.height / page_bbox[3]
|
| 112 |
+
table_images = [page_image.crop([bbox[0] * w_scaler, bbox[1] * h_scaler, bbox[2] * w_scaler, bbox[3] * h_scaler]) for bbox in marker_table_boxes]
|
| 113 |
|
| 114 |
# Normalize the bboxes
|
| 115 |
for bbox in marker_table_boxes:
|
|
|
|
| 152 |
unaligned_tables.add(table_idx)
|
| 153 |
continue
|
| 154 |
|
| 155 |
+
gemini_html = ""
|
| 156 |
+
if use_gemini:
|
| 157 |
+
gemini_html = gemini_table_rec(table_images[aligned_idx])
|
| 158 |
+
|
| 159 |
aligned_tables.append(
|
| 160 |
+
(marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
|
| 161 |
)
|
| 162 |
used_tables.add(aligned_idx)
|
| 163 |
|
| 164 |
total_unaligned += len(unaligned_tables)
|
| 165 |
|
| 166 |
+
for marker_table, gt_table, gemini_table in aligned_tables:
|
| 167 |
gt_table_html = gt_table['html']
|
| 168 |
|
| 169 |
#marker wraps the table in <tbody> which fintabnet data doesn't
|
|
|
|
| 174 |
th_tag.name = 'td'
|
| 175 |
marker_table_html = str(marker_table_soup)
|
| 176 |
marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines
|
| 177 |
+
gemini_table_html = gemini_table.replace("\n", " ") # Fintabnet uses spaces instead of newlines
|
| 178 |
|
| 179 |
results.append({
|
| 180 |
"marker_table": marker_table_html,
|
| 181 |
+
"gt_table": gt_table_html,
|
| 182 |
+
"gemini_table": gemini_table_html
|
| 183 |
})
|
| 184 |
except PdfiumError:
|
| 185 |
print('Broken PDF, Skipping...')
|
|
|
|
| 189 |
print(f"Could not align {total_unaligned} tables from fintabnet.")
|
| 190 |
|
| 191 |
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
| 192 |
+
marker_results = list(
|
| 193 |
tqdm(
|
| 194 |
executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
|
| 195 |
)
|
| 196 |
)
|
|
|
|
| 197 |
|
| 198 |
+
avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results)
|
| 199 |
headers = ["Avg score", "Total tables"]
|
| 200 |
+
data = [f"{avg_score:.3f}", len(marker_results)]
|
| 201 |
+
gemini_results = None
|
| 202 |
+
if use_gemini:
|
| 203 |
+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
| 204 |
+
gemini_results = list(
|
| 205 |
+
tqdm(
|
| 206 |
+
executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
|
| 207 |
+
total=len(results)
|
| 208 |
+
)
|
| 209 |
+
)
|
| 210 |
+
avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
|
| 211 |
+
headers.append("Avg Gemini score")
|
| 212 |
+
data.append(f"{avg_gemini_score:.3f}")
|
| 213 |
+
|
| 214 |
table = tabulate([data], headers=headers, tablefmt="github")
|
| 215 |
print(table)
|
| 216 |
print("Avg score computed by comparing marker predicted HTML with original HTML")
|
| 217 |
|
| 218 |
+
results = {
|
| 219 |
+
"marker": marker_results,
|
| 220 |
+
"gemini": gemini_results
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
with open(out_file, "w+") as f:
|
| 224 |
json.dump(results, f, indent=2)
|
| 225 |
|