Vik Paruchuri committed on
Commit
3453dd8
·
1 Parent(s): 818619c

Small bugfix

Browse files
README.md CHANGED
@@ -104,6 +104,7 @@ marker_single /path/to/file.pdf
104
  Options:
105
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
106
  - `--output_format [markdown|json|html]`: Specify the format for the output results.
 
107
  - `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
108
  - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
109
  - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
 
104
  Options:
105
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
106
  - `--output_format [markdown|json|html]`: Specify the format for the output results.
107
+ - `--paginate_output`: Paginates the output, separating pages with `\n\n{PAGE_NUMBER}` followed by 48 `-` characters, then `\n\n`.
108
  - `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
109
  - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
110
  - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
marker/processors/llm/llm_table_merge.py ADDED
File without changes
marker/processors/llm/utils.py CHANGED
@@ -35,7 +35,7 @@ class GoogleModel:
35
  while tries < max_retries:
36
  try:
37
  responses = self.model.generate_content(
38
- [prompt, image],
39
  stream=False,
40
  generation_config={
41
  "temperature": 0,
 
35
  while tries < max_retries:
36
  try:
37
  responses = self.model.generate_content(
38
+ [image, prompt], # According to gemini docs, it performs better if the image is the first element
39
  stream=False,
40
  generation_config={
41
  "temperature": 0,
marker/processors/sectionheader.py CHANGED
@@ -54,11 +54,8 @@ class SectionHeaderProcessor(BaseProcessor):
54
  heading_ranges = self.bucket_headings(flat_line_heights)
55
 
56
  for page in document.pages:
57
- for block in page.children:
58
- if block.block_type not in self.block_types:
59
- continue
60
-
61
- block_height = line_heights[block.id]
62
  if block_height > 0:
63
  for idx, (min_height, max_height) in enumerate(heading_ranges):
64
  if block_height >= min_height * self.height_tolerance:
 
54
  heading_ranges = self.bucket_headings(flat_line_heights)
55
 
56
  for page in document.pages:
57
+ for block in page.contained_blocks(document, self.block_types):
58
+ block_height = line_heights.get(block.id, 0)
 
 
 
59
  if block_height > 0:
60
  for idx, (min_height, max_height) in enumerate(heading_ranges):
61
  if block_height >= min_height * self.height_tolerance: