edereynal
/

financial_bert

Model card Files Files and versions

financial_bert / financial_bert /table_utils.py

edereynal's picture

Initial commit: FinancialModernBERT package

dbb332c 26 days ago

history blame contribute delete

1.9 kB

	"""HTML table parsing and structural token IDs."""
	import re
	from typing import Dict, List, Tuple

	from bs4 import BeautifulSoup

	# Matches one complete <table ...>...</table> element, case-insensitively.
	# DOTALL lets `.` cross newlines; the non-greedy `.*?` stops at the first
	# closing tag, so a table nested inside another is cut at the inner </table>.
	_TABLE_RE = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)

	# Structural tokens for table boundaries and cell delimiters.
	# [unused0] and [unused1] are pre-allocated in ModernBERT's vocabulary
	# with random embeddings — they learn table semantics during fine-tuning.
	TABLE_START_ID = 50285  # [unused0]
	TABLE_END_ID = 50286  # [unused1]
	TAB_ID = 186  # \t — cell delimiter
	NEWLINE_ID = 187  # \n — row delimiter


	def _span_attr(cell, name: str) -> int:
	    """Read the ``colspan``/``rowspan`` attribute *name* of *cell* as an int >= 1.

	    Any markup embedded in the attribute value is stripped before conversion.
	    A missing, non-numeric, zero or negative value is treated as 1.
	    """
	    raw = re.sub(r'<[^>]+>', '', str(cell.get(name, 1)))
	    try:
	        span = int(raw)
	    except (ValueError, TypeError):
	        return 1
	    # A span of 0 would leave the column cursor in place (duplicate column
	    # indices) and a negative one would move it backwards.
	    return max(span, 1)


	def parse_table_grid(table_html: str) -> List[List[Tuple[int, str]]]:
	    """Parse an HTML table into a grid of (col_index, cell_content) per row.

	    Handles colspan and rowspan. Cell content preserves inner HTML (including
	    <number> tags) but strips surrounding whitespace. Grid positions covered
	    by a span from an earlier cell are skipped, so they never appear in the
	    output; only the cell that starts the span is listed, at its first column.

	    Args:
	        table_html: HTML source containing the table's ``<tr>`` rows.

	    Returns:
	        grid[row] = [(col_idx, content_html), ...] with one entry per
	        ``<td>``/``<th>`` element, in document order. A ``<tr>`` with no
	        cells yields an empty list; input with no ``<tr>`` yields ``[]``.
	    """
	    soup = BeautifulSoup(table_html, 'html.parser')
	    # NOTE(review): find_all searches all descendants, so rows and cells of
	    # nested tables are visited as well — confirm inputs never nest tables.
	    rows = soup.find_all('tr')

	    # (row, col) positions already claimed by a cell or by one of its spans.
	    occupied = set()
	    grid: List[List[Tuple[int, str]]] = []

	    for row_idx, tr in enumerate(rows):
	        row_cells: List[Tuple[int, str]] = []
	        col = 0
	        for cell in tr.find_all(['td', 'th']):
	            # Skip columns taken by a rowspan from a previous row.
	            while (row_idx, col) in occupied:
	                col += 1

	            col_span = _span_attr(cell, 'colspan')
	            row_span = _span_attr(cell, 'rowspan')

	            row_cells.append((col, cell.decode_contents().strip()))

	            # Reserve the full rectangle this cell covers.
	            for d_row in range(row_span):
	                for d_col in range(col_span):
	                    occupied.add((row_idx + d_row, col + d_col))
	            col += col_span
	        grid.append(row_cells)

	    return grid