pdf-extract / utils.py
vasilee's picture
extract text and tables
1a755c0
from gmft.formatters.base import FormattedTable
from gmft.formatters.page.base import FormattedPage
from gmft.pdf_bindings.base import BasePage
from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig
from gmft.pdf_bindings import PyPDFium2Document
# Module-level table detector, constructed once when this module is imported.
detector = AutoTableDetector()

# Module-level table formatter, constructed once when this module is imported.
# Its TATR config switches off both `semantic_spanning_cells` and
# `enable_multi_header`.
formatter = AutoTableFormatter(
    TATRFormatConfig(
        enable_multi_header=False,
        semantic_spanning_cells=False,
    )
)
def get_page_text_with_tables(
    page: BasePage, tables: list[FormattedTable]
) -> str:
    """Return the text of ``page`` with each table rendered inline as LaTeX.

    The page is walked word by word. A word whose bounding box intersects a
    table's ``rect`` is dropped from the running text; the first time that
    happens for a given table, the table's LaTeX rendering is inserted at that
    position instead. Words outside every table are appended with a separator
    chosen from their ``wordno`` / ``lineno`` values: one newline when
    ``wordno == 0``, a second newline when ``lineno == 0`` as well, and a
    single space otherwise.

    Args:
        page: Page providing ``_get_positions_and_text_and_breaks()`` (and
            ``_get_text_with_breaks()`` for the no-table shortcut).
        tables: Tables found on ``page``. Each must expose ``rect`` and
            ``df()``.

    Returns:
        The assembled page text with leading whitespace stripped. When
        ``tables`` is empty, the result of ``page._get_text_with_breaks()`` is
        returned unchanged (presumably a ``str`` as well; confirm against the
        gmft binding in use).
    """
    if not tables:
        return page._get_text_with_breaks()

    parts: list[str] = []
    # rendered[j] becomes True once tables[j] has been handled, so every table
    # is emitted at most once, at the position of its first intersecting word.
    rendered = [False] * len(tables)

    words = page._get_positions_and_text_and_breaks()
    for x0, y0, x1, y1, word, _blockno, lineno, wordno in words:
        word_box = (x0, y0, x1, y1)

        # Index of the first table containing this word, or None if the word
        # lies outside all tables.
        owner = next(
            (
                j
                for j, table in enumerate(tables)
                if table.rect.is_intersecting(word_box)
            ),
            None,
        )

        if owner is not None:
            if not rendered[owner]:
                # Marked as handled even if rendering fails, so a broken table
                # is not retried for each of its words.
                rendered[owner] = True
                try:
                    latex = tables[owner].df().fillna("").to_latex(index=False)
                except Exception:
                    # Best effort: rendering throws for tables that have no
                    # text, so such a table is skipped. Only ``Exception`` is
                    # caught so that KeyboardInterrupt / SystemExit still
                    # propagate.
                    pass
                else:
                    parts.append(f"\n{latex}\n")
            # Words inside a table never reach the running text.
            continue

        if wordno == 0:
            parts.append("\n")
            if lineno == 0:
                parts.append("\n")
        else:
            parts.append(" ")
        parts.append(word)

    return "".join(parts).lstrip()