import logging

from gmft.formatters.base import FormattedTable
from gmft.formatters.page.base import FormattedPage
from gmft.pdf_bindings.base import BasePage
from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig
from gmft.pdf_bindings import PyPDFium2Document

logger = logging.getLogger(__name__)

# Shared detector/formatter instances, constructed once at import time.
detector = AutoTableDetector()
formatter = AutoTableFormatter(
    TATRFormatConfig(semantic_spanning_cells=False, enable_multi_header=False)
)


def get_page_text_with_tables(
    page: BasePage, tables: list[FormattedTable]
) -> str:
    """Return the text of *page* with each table replaced by a LaTeX rendering.

    The page's words are walked in the order produced by
    ``page._get_positions_and_text_and_breaks()``. A word whose bounding box
    intersects a table's ``rect`` is dropped; the first time a given table is
    hit, its LaTeX rendering is emitted in place of that word. All other
    words are joined with a space, a newline when ``wordno == 0``, and an
    additional newline when ``lineno == 0`` as well.

    Args:
        page: Page to extract text from.
        tables: Formatted tables located on ``page``. If empty, the result of
            ``page._get_text_with_breaks()`` is returned unchanged.

    Returns:
        The page text with leading whitespace stripped.
        NOTE(review): the empty-``tables`` path returns whatever
        ``page._get_text_with_breaks()`` returns -- presumably ``str``; confirm.
    """
    if not tables:
        return page._get_text_with_breaks()

    pieces = []
    emitted = set()  # indices of tables already handled (rendered or skipped)

    for (
        x0,
        y0,
        x1,
        y1,
        word,
        _blockno,
        lineno,
        wordno,
    ) in page._get_positions_and_text_and_breaks():
        bbox = (x0, y0, x1, y1)
        # Only the first intersecting table (in list order) claims the word.
        hit = next(
            (
                index
                for index, table in enumerate(tables)
                if table.rect.is_intersecting(bbox)
            ),
            None,
        )

        if hit is not None:
            if hit not in emitted:
                # Mark before rendering so a failing table is attempted once.
                emitted.add(hit)
                try:
                    # Assumes df() returns a pandas DataFrame -- TODO confirm.
                    latex = tables[hit].df().fillna("").to_latex(index=False)
                except Exception:
                    # Best effort: rendering raises for tables with no text.
                    # Skip the table, but let KeyboardInterrupt/SystemExit
                    # propagate (a bare ``except`` would swallow them).
                    logger.debug(
                        "Skipping table %d: could not render as LaTeX",
                        hit,
                        exc_info=True,
                    )
                else:
                    pieces.append(f"\n{latex}\n")
            # The word lies inside a table, so it is never emitted as text.
            continue

        # Word is outside every table: emit it with its separator.
        if wordno == 0:
            pieces.append("\n")
            if lineno == 0:
                pieces.append("\n")
        else:
            pieces.append(" ")
        pieces.append(word)

    return "".join(pieces).lstrip()