Spaces:
Sleeping
Sleeping
| from gmft.formatters.base import FormattedTable | |
| from gmft.formatters.page.base import FormattedPage | |
| from gmft.pdf_bindings.base import BasePage | |
| from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig | |
| from gmft.pdf_bindings import PyPDFium2Document | |
# Module-level table detector, built once when this module is imported.
detector = AutoTableDetector()

# Module-level table formatter. Its config switches off semantic spanning
# cells and multi-header handling.
formatter = AutoTableFormatter(
    TATRFormatConfig(
        semantic_spanning_cells=False,
        enable_multi_header=False,
    )
)
def get_page_text_with_tables(
    page: BasePage, tables: list[FormattedTable]
) -> str:
    """Return the text of ``page`` with each table inlined as LaTeX.

    The page's words are walked in the order yielded by
    ``page._get_positions_and_text_and_breaks()``. A word whose bounding box
    intersects a table's ``rect`` is not emitted as plain text; instead, the
    first time a given table is hit, that table is rendered with
    ``table.df().fillna("").to_latex(index=False)`` and inserted on its own
    lines. Words outside every table are joined with spaces, with a newline
    before the first word of a line and an extra newline before the first
    line of a block.

    Args:
        page: The page to read words from.
        tables: Tables located on ``page``. When empty, the result of
            ``page._get_text_with_breaks()`` is returned unchanged
            (presumably a plain string -- confirm against the gmft API).

    Returns:
        The assembled page text with leading whitespace stripped.
    """
    if not tables:
        return page._get_text_with_breaks()

    text_builder: list[str] = []
    # done[j] becomes True once table j has been handled, so every table is
    # rendered at most once no matter how many of its words are visited.
    done = [False] * len(tables)

    for x0, y0, x1, y1, word, _blockno, lineno, wordno in (
        page._get_positions_and_text_and_breaks()
    ):
        for j, table in enumerate(tables):
            if not table.rect.is_intersecting((x0, y0, x1, y1)):
                continue
            if not done[j]:
                done[j] = True
                try:
                    table_content = table.df().fillna("").to_latex(index=False)
                except Exception:
                    # Best effort: rendering throws for tables that have no
                    # text, so such a table is simply left out. Only
                    # Exception is caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    pass
                else:
                    text_builder.append(f"\n{table_content}\n")
            # The word belongs to a table: never emit it as plain text.
            break
        else:
            # No table contains this word: emit it with its separator.
            if wordno == 0:
                text_builder.append("\n")
                if lineno == 0:
                    # First word of the first line of a block: blank line.
                    text_builder.append("\n")
            else:
                text_builder.append(" ")
            text_builder.append(word)

    return "".join(text_builder).lstrip()