"""HTML table parsing and structural token IDs.""" import re from typing import Dict, List, Tuple from bs4 import BeautifulSoup _TABLE_RE = re.compile(r']*>.*?', re.DOTALL | re.IGNORECASE) # Structural tokens for table boundaries and cell delimiters. # [unused0] and [unused1] are pre-allocated in ModernBERT's vocabulary # with random embeddings — they learn table semantics during fine-tuning. TABLE_START_ID = 50285 # [unused0] TABLE_END_ID = 50286 # [unused1] TAB_ID = 186 # \t — cell delimiter NEWLINE_ID = 187 # \n — row delimiter def parse_table_grid(table_html: str) -> List[List[Tuple[int, str]]]: """Parse an HTML table into a grid of (col_index, cell_content) per row. Handles colspan and rowspan. Cell content preserves inner HTML (including tags) but strips whitespace. Spanned cells are omitted. Returns: grid[row] = [(col_idx, content_html), ...] """ soup = BeautifulSoup(table_html, 'html.parser') trs = soup.find_all('tr') occupied: Dict[Tuple[int, int], bool] = {} grid: List[List[Tuple[int, str]]] = [] for ri, tr in enumerate(trs): cells = [] c = 0 for cell in tr.find_all(['td', 'th']): while occupied.get((ri, c)): c += 1 try: cs = int(re.sub(r'<[^>]+>', '', str(cell.get('colspan', 1)))) except (ValueError, TypeError): cs = 1 try: rs = int(re.sub(r'<[^>]+>', '', str(cell.get('rowspan', 1)))) except (ValueError, TypeError): rs = 1 content = cell.decode_contents().strip() cells.append((c, content)) for dr in range(rs): for dc in range(cs): occupied[(ri + dr, c + dc)] = True c += cs grid.append(cells) return grid