"""HTML table parsing and structural token IDs."""
import re
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup

# Matches one <table ...>...</table> element, case-insensitively (IGNORECASE)
# and across line breaks (DOTALL lets `.` match newlines). The body is matched
# non-greedily, so a match ends at the first closing </table> tag encountered;
# with nested tables that is the inner table's closing tag.
_TABLE_RE = re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE)

# Structural tokens for table boundaries and cell delimiters.
# [unused0] and [unused1] are pre-allocated in ModernBERT's vocabulary
# with random embeddings — they learn table semantics during fine-tuning.
# NOTE(review): the numeric IDs below are tokenizer-specific — confirm them
# against the tokenizer actually in use before changing models.
TABLE_START_ID = 50285  # [unused0]
TABLE_END_ID = 50286    # [unused1]
TAB_ID = 186            # \t — cell delimiter
NEWLINE_ID = 187        # \n — row delimiter


# Upper bounds the HTML Standard places on span attributes. Larger values are
# clamped so a hostile or corrupt attribute cannot blow up time or memory.
_MAX_COLSPAN = 1000
_MAX_ROWSPAN = 65534


def _span_attr(cell, attr: str, limit: int) -> int:
    """Return the sanitized integer value of a cell's colspan/rowspan attribute.

    Any markup embedded in the attribute value is stripped before parsing.
    Missing or non-integer values fall back to 1, values below 1 are raised
    to 1, and values above ``limit`` are lowered to ``limit``.
    """
    raw = re.sub(r'<[^>]+>', '', str(cell.get(attr, 1)))
    try:
        span = int(raw)
    except (ValueError, TypeError):
        return 1
    return min(max(span, 1), limit)


def parse_table_grid(table_html: str) -> List[List[Tuple[int, str]]]:
    """Parse an HTML table into a grid of (col_index, cell_content) per row.

    Handles colspan and rowspan: every cell is reported once, at the first
    column it covers in the row where it is declared; grid positions that are
    merely covered by a span produce no entry. Cell content is the cell's
    inner HTML (nested tags such as <number> are preserved) with leading and
    trailing whitespace stripped.

    Span attributes are sanitized: unparsable values count as 1, values below
    1 (including ``0``) are treated as 1 so the column cursor always moves
    forward, and oversized values are clamped to the HTML Standard limits
    (colspan 1000, rowspan 65534).

    Note: rows and cells are collected with a recursive ``find_all``, so the
    <tr>/<td>/<th> elements of any nested table are visited as well.

    Args:
        table_html: Markup of a single table.

    Returns:
        grid[row] = [(col_idx, content_html), ...], one list per <tr> in
        document order. A row without cells yields an empty list.
    """
    soup = BeautifulSoup(table_html, 'html.parser')
    rows = soup.find_all('tr')
    row_count = len(rows)
    # (row, col) slots claimed by a rowspan cell declared in an earlier row.
    occupied = set()
    grid: List[List[Tuple[int, str]]] = []

    for ri, tr in enumerate(rows):
        cells = []
        col = 0
        for cell in tr.find_all(['td', 'th']):
            # Skip columns already taken by a cell spanning down from above.
            while (ri, col) in occupied:
                col += 1
            cs = _span_attr(cell, 'colspan', _MAX_COLSPAN)
            rs = _span_attr(cell, 'rowspan', _MAX_ROWSPAN)
            cells.append((col, cell.decode_contents().strip()))
            # Only later rows need marking: within the current row the cursor
            # jumps past this cell below, and rows beyond the last <tr> are
            # never consulted, so bounding by row_count keeps the set small.
            last_row = min(ri + rs, row_count)
            for r in range(ri + 1, last_row):
                for c in range(col, col + cs):
                    occupied.add((r, c))
            col += cs
        grid.append(cells)

    return grid