financial_bert / financial_bert /table_utils.py
edereynal's picture
Initial commit: FinancialModernBERT package
dbb332c
"""HTML table parsing and structural token IDs."""
import re
from typing import Dict, List, Tuple
from bs4 import BeautifulSoup
# Lazily matches one <table ...> ... </table> element: IGNORECASE accepts any
# tag casing, DOTALL lets `.` cross line breaks, and the non-greedy `.*?`
# stops at the first closing tag.
_TABLE_RE = re.compile(
    r'<table[^>]*>.*?</table>',
    flags=re.IGNORECASE | re.DOTALL,
)

# Token IDs that encode table structure: table boundaries plus cell and row
# delimiters. [unused0] and [unused1] are pre-allocated slots in ModernBERT's
# vocabulary with random embeddings, so they learn table semantics during
# fine-tuning.
TABLE_START_ID = 50285  # [unused0]
TABLE_END_ID = 50286    # [unused1]
TAB_ID = 186            # \t — separates cells
NEWLINE_ID = 187        # \n — separates rows
def _cell_span(cell, attr: str) -> int:
    """Read a cell's ``colspan``/``rowspan`` attribute as an integer >= 1.

    Args:
        cell: A parsed ``<td>``/``<th>`` element exposing a dict-style ``get``.
        attr: Attribute name to read (``'colspan'`` or ``'rowspan'``).

    Returns:
        The attribute value as an int. Missing, non-numeric, zero and
        negative values all fall back to 1.
    """
    try:
        # Remove anything that looks like a tag from the raw attribute value
        # before converting it.
        span = int(re.sub(r'<[^>]+>', '', str(cell.get(attr, 1))))
    except (ValueError, TypeError):
        return 1
    # A span below 1 would stall or rewind the column cursor and hand out
    # duplicate/negative column indices, so treat it as a single cell.
    return max(span, 1)


def parse_table_grid(table_html: str) -> List[List[Tuple[int, str]]]:
    """Parse an HTML table into a grid of (col_index, cell_content) per row.

    Handles colspan and rowspan: every cell is reported once, at the first
    column it covers, and the positions it spans are skipped when numbering
    later cells. Cell content preserves inner HTML (including <number> tags)
    with leading/trailing whitespace stripped.

    Args:
        table_html: HTML markup of the table to parse.

    Returns:
        ``grid[row] = [(col_idx, content_html), ...]`` with one entry per
        ``<tr>`` found, in document order. A row without cells yields ``[]``.
    """
    soup = BeautifulSoup(table_html, 'html.parser')
    # NOTE(review): find_all searches all descendants, so rows and cells of a
    # nested table would be visited as well — confirm inputs are flat tables.
    rows = soup.find_all('tr')
    n_rows = len(rows)

    # (row, col) positions taken by a cell from an earlier row via rowspan.
    occupied = set()
    grid: List[List[Tuple[int, str]]] = []

    for ri, tr in enumerate(rows):
        row_cells: List[Tuple[int, str]] = []
        col = 0
        for cell in tr.find_all(['td', 'th']):
            # Skip columns already claimed by a rowspan from a row above.
            while (ri, col) in occupied:
                col += 1

            col_span = _cell_span(cell, 'colspan')
            row_span = _cell_span(cell, 'rowspan')

            row_cells.append((col, cell.decode_contents().strip()))

            # Only rows *below* need marking: on the current row the cursor
            # simply advances past the span. Stop at the last real row so an
            # oversized rowspan cannot create entries that are never read.
            for below in range(ri + 1, min(ri + row_span, n_rows)):
                for spanned_col in range(col, col + col_span):
                    occupied.add((below, spanned_col))

            col += col_span
        grid.append(row_cells)

    return grid