Spaces:

vasilee
/

pdf-extract

Sleeping

pdf-extract / utils.py

extract text and tables

1a755c0 4 months ago

1.66 kB

	from gmft.formatters.base import FormattedTable
	from gmft.formatters.page.base import FormattedPage
	from gmft.pdf_bindings.base import BasePage

	from gmft.auto import AutoTableFormatter, AutoTableDetector, TATRFormatConfig
	from gmft.pdf_bindings import PyPDFium2Document

	# Module-level gmft objects, constructed once when this module is imported.
	# The formatter's config explicitly switches off both semantic spanning
	# cells and multi-header handling.
	detector = AutoTableDetector()
	formatter = AutoTableFormatter(
	    TATRFormatConfig(
	        semantic_spanning_cells=False,
	        enable_multi_header=False,
	    )
	)


	def get_page_text_with_tables(
	    page: BasePage, tables: list[FormattedTable]
	) -> str:
	    """Return the page's text with each table rendered inline as LaTeX.

	    Words are read from ``page._get_positions_and_text_and_breaks()``. A word
	    whose bounding box intersects a table's ``rect`` is not emitted; the first
	    such word seen for a given table is replaced by that table's
	    ``df().fillna("").to_latex(index=False)`` output, surrounded by newlines.
	    Every other word is emitted preceded by a single space, or by a newline
	    when ``wordno == 0`` (two newlines when ``lineno == 0`` as well).

	    Args:
	        page: Page to read words from.
	        tables: Tables located on ``page``. When empty, the result of
	            ``page._get_text_with_breaks()`` is returned unchanged.

	    Returns:
	        The assembled text with leading whitespace stripped.
	    """
	    if not tables:
	        return page._get_text_with_breaks()

	    text_builder = []
	    # done[j] becomes True once table j has been handled, so it is rendered
	    # (or skipped after a failure) at most once.
	    done = [False] * len(tables)
	    words = page._get_positions_and_text_and_breaks()
	    for x0, y0, x1, y1, word, _blockno, lineno, wordno in words:
	        for j, table in enumerate(tables):
	            if table.rect.is_intersecting((x0, y0, x1, y1)):
	                if not done[j]:
	                    try:
	                        table_content = table.df().fillna("").to_latex(index=False)
	                        text_builder.append(f"\n{table_content}\n")
	                    except Exception:
	                        # Best effort: rendering raises for tables that have
	                        # no text, so such a table is left out. Only
	                        # Exception is caught so that KeyboardInterrupt and
	                        # SystemExit still propagate.
	                        pass
	                    done[j] = True
	                # The word lies inside a table: drop it and stop checking
	                # the remaining tables.
	                break
	        else:
	            # The word intersects no table, so emit it with its separator.
	            # NOTE(review): this nesting was reconstructed from a copy whose
	            # indentation had been lost; confirm against the upstream file.
	            if wordno == 0:
	                text_builder.append("\n")
	                if lineno == 0:
	                    text_builder.append("\n")
	            else:
	                text_builder.append(" ")
	            text_builder.append(word)

	    page_content = "".join(text_builder).lstrip()
	    return page_content