Instructions to use edereynal/financial_bert with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use edereynal/financial_bert with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("edereynal/financial_bert", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """HTML table parsing and structural token IDs.""" | |
| import re | |
| from typing import Dict, List, Tuple | |
| from bs4 import BeautifulSoup | |
# Matches one <table ...> ... </table> span. Matching is case-insensitive and
# "." also matches newlines; the body match is non-greedy, so it ends at the
# first closing tag.
_TABLE_RE = re.compile(
    r'<table[^>]*>.*?</table>',
    flags=re.IGNORECASE | re.DOTALL,
)

# Token IDs for the structural markers: table boundaries and cell/row
# delimiters. The [unused0] and [unused1] slots are pre-allocated in
# ModernBERT's vocabulary with random embeddings, so they acquire table
# semantics during fine-tuning.
TABLE_START_ID = 50285  # [unused0]
TABLE_END_ID = 50286  # [unused1]
TAB_ID = 186  # "\t": cell delimiter
NEWLINE_ID = 187  # "\n": row delimiter
# Anything that looks like a tag inside a colspan/rowspan attribute value.
_SPAN_MARKUP_RE = re.compile(r'<[^>]+>')

# The HTML Standard treats any colspan above 1000 as 1000; enforcing the same
# cap keeps a single malformed attribute from blowing up the occupancy set.
_MAX_COLSPAN = 1000


def _span_attr(cell, name: str) -> int:
    """Return the ``colspan``/``rowspan`` attribute *name* of *cell* as an int >= 1.

    Missing, non-numeric, zero, or negative values all yield 1.
    """
    # Tag-like fragments are removed from the attribute value before parsing.
    # NOTE(review): presumably upstream preprocessing can inject tags such as
    # <number> into attribute values -- confirm against the caller.
    raw = _SPAN_MARKUP_RE.sub('', str(cell.get(name, 1)))
    try:
        span = int(raw)
    except ValueError:
        return 1
    # A span of 0 or less would leave the column cursor in place (or move it
    # backwards), producing duplicate or negative column indices.
    return max(span, 1)


def parse_table_grid(table_html: str) -> List[List[Tuple[int, str]]]:
    """Parse an HTML table into a grid of (col_index, cell_content) per row.

    Handles colspan and rowspan. Cell content preserves inner HTML (including
    <number> tags) with leading/trailing whitespace stripped. Grid positions
    covered by another cell's span get no entry of their own; a spanning cell
    is reported once, at its top-left position.

    Span values that are missing, unparsable, or not positive are treated
    as 1. Colspan is capped at 1000 (the HTML Standard limit) and rowspan at
    the number of remaining rows.

    Note: rows and cells are collected with recursive ``find_all`` calls, so
    the rows and cells of any nested table are included as well.

    Args:
        table_html: Markup of the table to parse.

    Returns:
        grid[row] = [(col_idx, content_html), ...], one list per <tr> found,
        in document order.
    """
    soup = BeautifulSoup(table_html, 'html.parser')
    rows = soup.find_all('tr')
    n_rows = len(rows)

    # (row, col) slots in later rows that are claimed by a rowspan from above.
    occupied = set()
    grid: List[List[Tuple[int, str]]] = []

    for ri, tr in enumerate(rows):
        cells: List[Tuple[int, str]] = []
        col = 0
        for cell in tr.find_all(['td', 'th']):
            # Skip over columns taken by cells spanning down from earlier rows.
            while (ri, col) in occupied:
                col += 1

            colspan = min(_span_attr(cell, 'colspan'), _MAX_COLSPAN)
            # Rows past the end of the table are never consulted, so bounding
            # the rowspan by the remaining rows cannot change the result.
            rowspan = min(_span_attr(cell, 'rowspan'), n_rows - ri)

            cells.append((col, cell.decode_contents().strip()))

            # Only rows below need marking: within the current row the cursor
            # moves past this cell's columns and never looks back.
            for dr in range(1, rowspan):
                for dc in range(colspan):
                    occupied.add((ri + dr, col + dc))

            col += colspan
        grid.append(cells)

    return grid