| from pathlib import Path |
|
|
| from gmft.auto import AutoFormatConfig, AutoTableFormatter, CroppedTable, TableDetector |
| from gmft.pdf_bindings import PyPDFium2Document |
|
|
| from .settings import ENABLE_DEBUG_MODE |
|
|
# Formatting options handed to the table formatter below; both flags are
# switched on before the formatter is constructed.
config = AutoFormatConfig()
config.semantic_spanning_cells = True
config.enable_multi_header = True

# Module-level instances, created once at import time and shared by the
# functions in this module.
detector = TableDetector()
formatter = AutoTableFormatter(config)
|
|
|
|
# Root directory under which convert_gmft creates one sub-directory per
# converted document for its debug images.
GMFT_DEBUG_PATH = Path("/tmp/gmft")
# parents=True keeps the import from failing with FileNotFoundError when the
# parent directory does not exist yet; exist_ok=True makes re-imports harmless.
GMFT_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
|
|
|
|
def ingest_pdf(pdf_path) -> list[CroppedTable]:
    """Run the module-level table detector over every page of a PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to
            ``PyPDFium2Document``.

    Returns:
        The tables found by ``detector.extract`` for each page, concatenated
        in page-iteration order.

    Note:
        The document opened here is not closed by this function.
        NOTE(review): presumably the returned tables still need the open
        document when they are formatted later - confirm before adding a
        ``close()`` here.
    """
    document = PyPDFium2Document(pdf_path)
    return [
        found
        for page in document
        for found in detector.extract(page)
    ]
|
|
|
|
def convert_gmft(path: str, file_name: str):
    """Extract every table from a PDF and render the tables as HTML.

    Args:
        path: Location of the PDF to process.
        file_name: Name of the sub-directory of ``GMFT_DEBUG_PATH`` that is
            created for this document's debug images.

    Returns:
        A ``(content, debug_image_paths)`` tuple. ``content`` is the HTML of
        all converted tables joined by blank lines (an empty string when no
        table was converted). ``debug_image_paths`` holds the paths of the
        PNG files written for those tables; it stays empty unless
        ``ENABLE_DEBUG_MODE`` is truthy.
    """
    formatted_tables = []
    debug_image_paths = []

    # The document is opened here instead of through ingest_pdf so that its
    # handle can be released once all tables have been formatted; ingest_pdf
    # returns only the tables and leaves the document open.
    doc = PyPDFium2Document(path)
    try:
        tables = []
        for page in doc:
            tables.extend(detector.extract(page))

        # parents=True: still works if the root debug directory was removed
        # after import, or if file_name contains sub-directories.
        debug_path = GMFT_DEBUG_PATH / file_name
        debug_path.mkdir(parents=True, exist_ok=True)

        for idx, table in enumerate(tables):
            ft = formatter.extract(
                table,
                dpi=72 * 2,  # 144 dpi
            )
            df = ft.df()
            if df is None:
                continue

            formatted_tables.append(df.fillna("").to_html(index=False))

            # Images are saved only for tables that produced HTML, so the
            # two returned collections describe the same tables.
            if ENABLE_DEBUG_MODE:
                image_path = debug_path / f"table_{idx}.png"
                ft.image().save(image_path)
                debug_image_paths.append(image_path)
    finally:
        # Formatting and image rendering are finished (or failed); nothing
        # returned from this function refers to the document any more.
        doc.close()

    content = "\n\n".join(formatted_tables)
    return content, debug_image_paths
|
|