Spaces:

rt4u
/

marker

Sleeping

App Files Community

Vik Paruchuri committed on Feb 13

Commit

65d4b3a

1 Parent(s): 7fd45b1

Update examples, OCR fix

Browse files

Files changed (10) hide show

README.md +1 -1
marker/builders/llm_layout.py +1 -11
marker/builders/ocr.py +1 -1
marker/processors/blockquote.py +1 -1
marker/processors/llm/__init__.py +0 -8
marker/services/__init__.py +12 -3
marker/services/gemini.py +10 -4
marker/services/ollama.py +2 -3
marker/services/vertex.py +1 -1
pyproject.toml +1 -1

README.md CHANGED Viewed

@@ -22,7 +22,7 @@ See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instruc
 ## Hybrid Mode
-For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker.  This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms.  It can use any Google model (`gemini-2.0-flash` by default), or any ollama model.  See [below](#llm-services) for details.
 Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:

 ## Hybrid Mode
+For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker.  This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms.  It can use any gemini or ollama model.  By default, it uses `gemini-2.0-flash`.  See [below](#llm-services) for details.
 Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:

marker/builders/llm_layout.py CHANGED Viewed

@@ -37,18 +37,10 @@ class LLMLayoutBuilder(LayoutBuilder):
         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
-    max_retries: Annotated[
-        int,
-        "The maximum number of retries to use for the Gemini model.",
-    ] = 2
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
-    timeout: Annotated[
-        int,
-        "The timeout for requests to the Gemini model.",
-    ] = 60
     disable_tqdm: Annotated[
         bool,
         "Whether to disable the tqdm progress bar.",
@@ -162,9 +154,7 @@ Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form
             prompt,
             image,
             block,
-            LayoutSchema,
-            max_retries=self.max_retries,
-            timeout=self.timeout
         )
         generated_label = None
         if response and "label" in response:

         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
     disable_tqdm: Annotated[
         bool,
         "Whether to disable the tqdm progress bar.",
             prompt,
             image,
             block,
+            LayoutSchema
         )
         generated_label = None
         if response and "label" in response:

marker/builders/ocr.py CHANGED Viewed

@@ -81,7 +81,7 @@ class OcrBuilder(BaseBuilder):
         recognition_results = self.recognition_model(
             images=images,
             bboxes=line_boxes,
-            langs=[self.languages] * len(document.pages),
             recognition_batch_size=int(self.get_recognition_batch_size()),
             sort_lines=False
         )

         recognition_results = self.recognition_model(
             images=images,
             bboxes=line_boxes,
+            langs=[self.languages] * len(pages),
             recognition_batch_size=int(self.get_recognition_batch_size()),
             sort_lines=False
         )

marker/processors/blockquote.py CHANGED Viewed

@@ -17,7 +17,7 @@ class BlockquoteProcessor(BaseProcessor):
         float,
         "The minimum horizontal indentation required to consider a block as part of a blockquote.",
         "Expressed as a percentage of the block width.",
-    ] = 0.05
     x_start_tolerance: Annotated[
         float,
         "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",

         float,
         "The minimum horizontal indentation required to consider a block as part of a blockquote.",
         "Expressed as a percentage of the block width.",
+    ] = 0.1
     x_start_tolerance: Annotated[
         float,
         "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",

marker/processors/llm/__init__.py CHANGED Viewed

@@ -42,18 +42,10 @@ class BaseLLMProcessor(BaseProcessor):
         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
-    max_retries: Annotated[
-        int,
-        "The maximum number of retries to use for the Gemini model.",
-    ] = 1
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
-    timeout: Annotated[
-        int,
-        "The timeout for requests to the Gemini model.",
-    ] = 20
     image_expansion_ratio: Annotated[
         float,
         "The ratio to expand the image by when cropping.",

         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
     image_expansion_ratio: Annotated[
         float,
         "The ratio to expand the image by when cropping.",

marker/services/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Optional, List
 import PIL
 from pydantic import BaseModel
@@ -8,6 +8,15 @@ from marker.util import assign_config, verify_config_keys
 class BaseService:
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
@@ -20,7 +29,7 @@ class BaseService:
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
-        max_retries: int = 1,
-        timeout: int = 15
      ):
         raise NotImplementedError

+from typing import Optional, List, Annotated
 import PIL
 from pydantic import BaseModel
 class BaseService:
+    timeout: Annotated[
+        int,
+        "The timeout to use for the service."
+    ] = 15
+    max_retries: Annotated[
+        int,
+        "The maximum number of retries to use for the service."
+    ] = 1
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
+        max_retries: int | None = None,
+        timeout: int | None = None
      ):
         raise NotImplementedError

marker/services/gemini.py CHANGED Viewed

@@ -23,7 +23,7 @@ class BaseGeminiService(BaseService):
         img.save(image_bytes, format="PNG")
         return image_bytes.getvalue()
-    def get_google_client(self, timeout: int = 60):
         raise NotImplementedError
     def __call__(
@@ -32,9 +32,15 @@ class BaseGeminiService(BaseService):
             image: PIL.Image.Image | List[PIL.Image.Image],
             block: Block,
             response_schema: type[BaseModel],
-            max_retries: int = 1,
-            timeout: int = 15
     ):
         if not isinstance(image, list):
             image = [image]
@@ -80,7 +86,7 @@ class GoogleGeminiService(BaseGeminiService):
         "The Google API key to use for the service."
     ] = None
-    def get_google_client(self, timeout: int = 60):
         return genai.Client(
             api_key=self.gemini_api_key,
             http_options={"timeout": timeout * 1000} # Convert to milliseconds

         img.save(image_bytes, format="PNG")
         return image_bytes.getvalue()
+    def get_google_client(self, timeout: int):
         raise NotImplementedError
     def __call__(
             image: PIL.Image.Image | List[PIL.Image.Image],
             block: Block,
             response_schema: type[BaseModel],
+            max_retries: int | None = None,
+            timeout: int | None = None
     ):
+        if max_retries is None:
+            max_retries = self.max_retries
+        if timeout is None:
+            timeout = self.timeout
         if not isinstance(image, list):
             image = [image]
         "The Google API key to use for the service."
     ] = None
+    def get_google_client(self, timeout: int):
         return genai.Client(
             api_key=self.gemini_api_key,
             http_options={"timeout": timeout * 1000} # Convert to milliseconds

marker/services/ollama.py CHANGED Viewed

@@ -32,8 +32,8 @@ class OllamaService(BaseService):
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
-        max_retries: int = 1,
-        timeout: int = 15
     ):
         url = f"{self.ollama_base_url}/api/generate"
         headers = {"Content-Type": "application/json"}
@@ -63,7 +63,6 @@ class OllamaService(BaseService):
             response.raise_for_status()
             response_data = response.json()
             data = response_data["response"]
-            print(data)
             return json.loads(data)
         except Exception as e:
             print(f"Ollama inference failed: {e}")

         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
+        max_retries: int | None = None,
+        timeout: int | None = None
     ):
         url = f"{self.ollama_base_url}/api/generate"
         headers = {"Content-Type": "application/json"}
             response.raise_for_status()
             response_data = response.json()
             data = response_data["response"]
             return json.loads(data)
         except Exception as e:
             print(f"Ollama inference failed: {e}")

marker/services/vertex.py CHANGED Viewed

@@ -18,7 +18,7 @@ class GoogleVertexService(BaseGeminiService):
         "The name of the Google model to use for the service."
     ] = "gemini-1.5-flash-002"
-    def get_google_client(self, timeout: int = 60):
         return genai.Client(
             vertexai=True,
             project=self.vertex_project_id,

         "The name of the Google model to use for the service."
     ] = "gemini-1.5-flash-002"
+    def get_google_client(self, timeout: int):
         return genai.Client(
             vertexai=True,
             project=self.vertex_project_id,

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.4.0"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"

 [tool.poetry]
 name = "marker-pdf"
+version = "1.5.0"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"