File size: 1,656 Bytes
1a755c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import logging

from gmft.formatters.base import FormattedTable
from gmft.formatters.page.base import FormattedPage
from gmft.pdf_bindings.base import BasePage

from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig
from gmft.pdf_bindings import PyPDFium2Document

# Module-level instances created once at import time: a table detector built
# with its default arguments, and a table formatter whose TATR config has both
# `semantic_spanning_cells` and `enable_multi_header` switched off.
detector = AutoTableDetector()
formatter = AutoTableFormatter(
    TATRFormatConfig(
        semantic_spanning_cells=False,
        enable_multi_header=False,
    )
)


def get_page_text_with_tables(
    page: BasePage, tables: list[FormattedTable]
) -> str:
    """Return the page text with each table region replaced by a LaTeX table.

    Words are consumed in the order yielded by
    ``page._get_positions_and_text_and_breaks()``, each as an 8-tuple
    ``(x0, y0, x1, y1, word, blockno, lineno, wordno)``. A word whose bounding
    box intersects a table's ``rect`` is dropped from the running text; the
    first such word seen for a given table causes that table's
    ``df().fillna("").to_latex(index=False)`` output to be inserted at that
    position, surrounded by newlines. Every other word is appended, preceded
    by a space, or by one newline when ``wordno == 0`` (two newlines when
    ``lineno == 0`` as well).

    Args:
        page: Page object providing ``_get_text_with_breaks()`` and
            ``_get_positions_and_text_and_breaks()``.
        tables: Tables found on ``page``; each must expose ``rect`` (with
            ``is_intersecting``) and ``df()``.

    Returns:
        The assembled text with leading whitespace stripped. When ``tables``
        is empty, the result of ``page._get_text_with_breaks()`` is returned
        unchanged (presumably also a ``str`` -- confirm against the binding).
    """
    if not tables:
        return page._get_text_with_breaks()

    logger = logging.getLogger(__name__)
    parts: list[str] = []
    # emitted[i] becomes True once tables[i] has been handled, so a table is
    # rendered at most once no matter how many of its words are encountered.
    emitted = [False] * len(tables)

    for x0, y0, x1, y1, word, _blockno, lineno, wordno in (
        page._get_positions_and_text_and_breaks()
    ):
        bbox = (x0, y0, x1, y1)
        for idx, table in enumerate(tables):
            if not table.rect.is_intersecting(bbox):
                continue
            if not emitted[idx]:
                # Mark first: a table that fails to render is skipped for
                # good rather than retried on each of its words.
                emitted[idx] = True
                try:
                    latex = table.df().fillna("").to_latex(index=False)
                except Exception:
                    # Best effort: rendering is known to fail for tables
                    # with no text. Only ordinary errors are swallowed, so
                    # KeyboardInterrupt/SystemExit still propagate.
                    logger.debug(
                        "Skipping table %d: could not render it as LaTeX",
                        idx,
                        exc_info=True,
                    )
                else:
                    parts.append(f"\n{latex}\n")
            # The word belongs to a table; never emit it as plain text.
            break
        else:
            # The word lies outside every table: emit it with its separator.
            if wordno == 0:
                parts.append("\n")
                if lineno == 0:
                    parts.append("\n")
            else:
                parts.append(" ")
            parts.append(word)

    return "".join(parts).lstrip()