| import os |
| import json |
| import subprocess |
| from pathlib import Path |
| from typing import Callable, Dict, List, Optional, Union |
|
|
| from PIL import Image |
| from pdf2image import convert_from_path |
|
|
def show(output):
    """Render a LaTeX ``tabular`` body to a small preview image.

    Args:
        output: Tabular content beginning with the column spec
            (e.g. ``{ll} a & b \\\\``); it is substituted into a
            standalone LaTeX document via ``%s``.

    Returns:
        A 200x200 PIL image of the compiled table, or ``None`` if
        compilation or PDF-to-PNG conversion failed.

    Side effects: writes ``tmp.tex``/``tmp.pdf``/``tmp.png`` in the
    current working directory.
    """
    # Backslashes are explicitly escaped: the original string relied on
    # `\d`, `\m`, `\e` not being Python escapes, which is fragile and a
    # warning on modern CPython. The resulting string is identical.
    doc = (
        "\\documentclass[varwidth=\\maxdimen]{standalone}\n"
        "\\usepackage{multirow, multicol, booktabs}\n"
        "\\begin{document}\n"
        "\\begin{tabular}%s\\end{tabular}\n"
        "\\end{document}"
    )
    code = doc % output
    # Remove stale artifacts so a failed run cannot return an old image.
    for stale in ("tmp.pdf", "tmp.png"):
        if os.path.exists(stale):
            os.remove(stale)
    with open('tmp.tex', 'w+') as of:
        of.write(code)
    try:
        # -interaction=nonstopmode / -halt-on-error keep pdflatex from
        # blocking on an interactive error prompt; check=True turns a
        # failed compile into an exception handled below.
        subprocess.run(
            ["pdflatex", "-interaction=nonstopmode", "-halt-on-error", 'tmp.tex'],
            check=True,
        )
        img = convert_from_path('tmp.pdf', fmt='png')[0]
        img.save('tmp.png', 'PNG')
        return Image.open('tmp.png').resize((200, 200))
    except Exception as e:
        # Best-effort preview: report the failure instead of raising.
        print(e)
        return None
| |
def pil_loader(img, mode):
    """Convert *img* to the requested PIL color *mode* and return the result."""
    converted = img.convert(mode=mode)
    return converted
|
|
|
|
class Tokenizer:
    """Maps between formula tokens and integer indices.

    The vocabulary must include the four special tokens ``<PAD>``,
    ``<SOS>``, ``<EOS>`` and ``<UNK>``; their indices are cached on the
    instance and (by default) stripped out again when decoding.
    """

    def __init__(self, token_to_index: Optional[Dict[str, int]] = None) -> None:
        """Build a tokenizer from a token -> index mapping.

        Args:
            token_to_index: Vocabulary mapping each token to a unique id.
                Must be non-empty and contain the four special tokens.

        Raises:
            ValueError: If no vocabulary is provided.
            KeyError: If a special token is missing from the vocabulary.
        """
        self.pad_token = "<PAD>"
        self.sos_token = "<SOS>"
        self.eos_token = "<EOS>"
        self.unk_token = "<UNK>"

        # `assert` is stripped under `python -O`; validate explicitly.
        if not token_to_index:
            raise ValueError("vocabulary with mapping from token to id?")
        self.token_to_index: Dict[str, int] = token_to_index
        # Inverse mapping for decoding; assumes indices are unique.
        self.index_to_token: Dict[int, str] = {
            index: token for token, index in self.token_to_index.items()
        }
        self.pad_index = self.token_to_index[self.pad_token]
        self.sos_index = self.token_to_index[self.sos_token]
        self.eos_index = self.token_to_index[self.eos_token]
        self.unk_index = self.token_to_index[self.unk_token]

        # Special-token indices dropped during inference-time decoding.
        self.ignore_indices = {self.pad_index, self.sos_index, self.eos_index, self.unk_index}

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return len(self.token_to_index)

    def encode(self, formula: List[str]) -> List[int]:
        """Convert a token sequence to indices, wrapped in <SOS>/<EOS>.

        Unknown tokens map to the <UNK> index rather than raising.
        """
        indices = [self.sos_index]
        for token in formula:
            indices.append(self.token_to_index.get(token, self.unk_index))
        indices.append(self.eos_index)
        return indices

    def decode(self, indices: List[int], inference: bool = True) -> List[str]:
        """Convert indices back to tokens, stopping at the first <EOS>.

        Args:
            indices: Sequence of vocabulary indices.
            inference: If True, drop all special tokens from the output.

        Raises:
            RuntimeError: If an index is not in the vocabulary.
        """
        tokens = []
        for index in indices:
            if index not in self.index_to_token:
                raise RuntimeError(f"Found an unknown index {index}")
            if index == self.eos_index:
                break
            if inference and index in self.ignore_indices:
                continue
            tokens.append(self.index_to_token[index])
        return tokens

    @classmethod
    def load(cls, filename: Union[Path, str]) -> "Tokenizer":
        """Create a `Tokenizer` from a mapping file outputted by `save`.

        Args:
            filename: Path to the JSON file to read from.

        Returns:
            A `Tokenizer` object.
        """
        # Vocabulary files are JSON; pin the encoding for portability.
        with open(filename, encoding="utf-8") as f:
            token_to_index = json.load(f)
        return cls(token_to_index)
|
|