Spaces:

MohamedSameh77i
/

Agent_PDF

Sleeping

Ag27 Deployer

Deploy Ag27 Table Extractor: 2026-04-29 19:38:38

df4a1a2 19 days ago

5.36 kB

	"""Export functions for table extraction annotations.

	Converts the annotation dict produced by ``pipeline.run_pipeline`` into
	HTML table snippets, CSV strings, JSON, or Excel workbooks.
	"""
	from __future__ import annotations

	import csv
	import io
	import json
	from html import escape


	def export_html(annotation: dict) -> str:
	    """Convert annotation to self-contained HTML ``<table>`` snippet(s).

	    One ``<table>`` is emitted per entry of ``annotation["tables"]`` that has
	    at least one cell; tables without cells are skipped. Spanning cells use
	    ``rowspan`` / ``colspan`` attributes on ``<td>`` elements, and grid
	    positions that have neither an anchor cell nor a covering span are
	    rendered as empty ``<td></td>`` elements.

	    Each cell dict is read for the keys ``row``, ``col``, ``row_span``,
	    ``col_span`` and ``text``. Missing span keys default to ``1`` and a
	    missing or ``None`` text is rendered as an empty cell. Cell text is
	    converted to ``str`` and HTML-escaped.

	    Args:
	        annotation: Annotation dict with an optional ``"tables"`` list.

	    Returns:
	        The HTML lines joined with newlines, or ``""`` when there is nothing
	        to render.
	    """
	    html_parts: list[str] = []

	    for table in annotation.get("tables", []):
	        cells = table.get("cells", [])
	        if not cells:
	            continue

	        # Grid extent: the furthest row/column reached by any cell's span.
	        max_row = max(c["row"] + c.get("row_span", 1) for c in cells)
	        max_col = max(c["col"] + c.get("col_span", 1) for c in cells)

	        # Positions covered by a span's non-anchor cells; these must not get
	        # their own <td>, because the anchor's rowspan/colspan fills them.
	        covered: set[tuple[int, int]] = set()
	        for c in cells:
	            row_span = c.get("row_span", 1)
	            col_span = c.get("col_span", 1)
	            if row_span > 1 or col_span > 1:
	                anchor = (c["row"], c["col"])
	                for ri in range(c["row"], c["row"] + row_span):
	                    for ci in range(c["col"], c["col"] + col_span):
	                        if (ri, ci) != anchor:
	                            covered.add((ri, ci))

	        # Anchor position -> cell; a later cell at the same anchor wins.
	        cell_map = {(c["row"], c["col"]): c for c in cells}

	        html_parts.append('<table border="1">')
	        for r in range(max_row):
	            html_parts.append(" <tr>")
	            for col in range(max_col):
	                if (r, col) in covered:
	                    continue
	                cell = cell_map.get((r, col))
	                if cell is None:
	                    html_parts.append(" <td></td>")
	                    continue
	                attrs: list[str] = []
	                row_span = cell.get("row_span", 1)
	                col_span = cell.get("col_span", 1)
	                if row_span > 1:
	                    attrs.append(f'rowspan="{row_span}"')
	                if col_span > 1:
	                    attrs.append(f'colspan="{col_span}"')
	                attr_str = (" " + " ".join(attrs)) if attrs else ""
	                # escape() only accepts str, so None / numeric text is
	                # normalised first instead of raising.
	                raw_text = cell.get("text", "")
	                text = escape("" if raw_text is None else str(raw_text))
	                html_parts.append(f" <td{attr_str}>{text}</td>")
	            html_parts.append(" </tr>")
	        html_parts.append("</table>")

	    return "\n".join(html_parts)


	def export_csv(annotation: dict, table_id: int) -> str:
	    """Convert a single table to a CSV string.

	    The table is laid out on a rectangular grid sized by the furthest
	    row/column any cell's span reaches. Spanning cells have their text in the
	    top-left (anchor) cell; other covered positions are left empty.

	    Each cell dict is read for the keys ``row``, ``col``, ``row_span``,
	    ``col_span`` and ``text``; missing span keys default to ``1`` and missing
	    text to ``""``.

	    Args:
	        annotation: Annotation dict with an optional ``"tables"`` list.
	        table_id: Value matched against each table's ``"table_id"`` entry;
	            the first matching table is exported.

	    Returns:
	        The CSV text produced by ``csv.writer``, or ``""`` when the table has
	        no cells.

	    Raises:
	        ValueError: If no table with ``table_id`` is found.
	    """
	    # Tables without a "table_id" key simply never match, so the documented
	    # ValueError is raised instead of a KeyError from the lookup itself.
	    table = next(
	        (t for t in annotation.get("tables", []) if t.get("table_id") == table_id),
	        None,
	    )
	    if table is None:
	        raise ValueError(f"Table ID {table_id} not found in annotation.")

	    cells = table.get("cells", [])
	    if not cells:
	        return ""

	    max_row = max(c["row"] + c.get("row_span", 1) for c in cells)
	    max_col = max(c["col"] + c.get("col_span", 1) for c in cells)

	    # Only anchor positions receive text; everything else stays "".
	    grid = [["" for _ in range(max_col)] for _ in range(max_row)]
	    for c in cells:
	        grid[c["row"]][c["col"]] = c.get("text", "")

	    buf = io.StringIO()
	    csv.writer(buf).writerows(grid)
	    return buf.getvalue()


	def export_json(annotation: dict, *, indent: int | None = 2) -> str:
	    """Serialize annotation to a JSON string.

	    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
	    escapes (``ensure_ascii=False``).

	    Args:
	        annotation: The annotation dict to serialize.
	        indent: Indentation passed to ``json.dumps``. Defaults to ``2``;
	            pass ``None`` for the most compact single-line form.

	    Returns:
	        The JSON text.
	    """
	    return json.dumps(annotation, indent=indent, ensure_ascii=False)


	def export_excel(annotation: dict, output_path: str) -> None:
	    """Write annotation to an Excel workbook (one sheet per table).

	    Each table becomes a sheet titled ``Table <table_id>`` (falling back to
	    the table's position in the list when it has no ``"table_id"``). Cell
	    text is written at the anchor position and spanning cells are merged
	    over the range they cover. Missing span keys default to ``1``.

	    When the annotation contains no tables, a single empty sheet is kept so
	    that the workbook can still be saved.

	    Requires ``openpyxl``.

	    Args:
	        annotation: Annotation dict with an optional ``"tables"`` list.
	        output_path: Destination passed to ``Workbook.save``.
	    """
	    from openpyxl import Workbook

	    wb = Workbook()
	    wb.remove(wb.active)  # remove default empty sheet

	    for idx, table in enumerate(annotation.get("tables", [])):
	        ws = wb.create_sheet(title=f"Table {table.get('table_id', idx)}")

	        for cell in table.get("cells", []):
	            r = cell["row"] + 1  # openpyxl is 1-indexed
	            c = cell["col"] + 1
	            row_span = cell.get("row_span", 1)
	            col_span = cell.get("col_span", 1)
	            ws.cell(row=r, column=c, value=cell.get("text", ""))
	            if row_span > 1 or col_span > 1:
	                ws.merge_cells(
	                    start_row=r,
	                    start_column=c,
	                    end_row=r + row_span - 1,
	                    end_column=c + col_span - 1,
	                )

	    # openpyxl refuses to save a workbook that has no sheets, so keep one
	    # empty placeholder when there were no tables to export.
	    if not wb.worksheets:
	        wb.create_sheet(title="Sheet")

	    wb.save(output_path)


	def export_csv_all(annotation: dict) -> str:
	    """Export all tables in a single CSV string.

	    Each table is preceded by a marker row ``--- Table N ---`` and
	    followed by a blank separator row. ``N`` is the table's ``"table_id"``,
	    or its position in the list when that key is absent. Tables without
	    cells are skipped entirely (no marker, no separator).

	    Spanning cells have their text in the top-left (anchor) position; other
	    covered positions are left empty. Missing span keys default to ``1`` and
	    missing text to ``""``.

	    Args:
	        annotation: Annotation dict with an optional ``"tables"`` list.

	    Returns:
	        The combined CSV text, or ``""`` when there are no tables.
	    """
	    tables = annotation.get("tables", [])
	    if not tables:
	        return ""

	    buf = io.StringIO()
	    writer = csv.writer(buf)
	    for idx, table in enumerate(tables):
	        cells = table.get("cells", [])
	        if not cells:
	            continue

	        # Marker row
	        writer.writerow([f"--- Table {table.get('table_id', idx)} ---"])

	        max_row = max(c["row"] + c.get("row_span", 1) for c in cells)
	        max_col = max(c["col"] + c.get("col_span", 1) for c in cells)

	        # Only anchor positions receive text; everything else stays "".
	        grid = [["" for _ in range(max_col)] for _ in range(max_row)]
	        for c in cells:
	            grid[c["row"]][c["col"]] = c.get("text", "")

	        writer.writerows(grid)
	        writer.writerow([])  # blank separator
	    return buf.getvalue()