# img2tex/util.py
# Source: Hugging Face repo (user: iankur), commit f4b6269 ("Update util.py").
import os
import json
import subprocess
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from PIL import Image
from pdf2image import convert_from_path
def show(output):
    """Compile a LaTeX tabular body to an image for quick visual inspection.

    Writes the wrapped LaTeX source to ``tmp.tex`` in the current directory,
    compiles it with ``pdflatex``, converts the resulting ``tmp.pdf`` to
    ``tmp.png``, and returns a thumbnail of it.

    Args:
        output: LaTeX source to substitute into ``\\begin{tabular}%s\\end{tabular}``,
            i.e. the column spec plus rows (e.g. ``"{ll} a & b \\\\"``).

    Returns:
        A 200x200 PIL image of the compiled table, or ``None`` if compilation
        or PDF-to-PNG conversion fails.
    """
    # Build the template from escaped pieces instead of a non-raw string with
    # sequences like "\d" / "\m" / "\e" — those are invalid escapes (a
    # SyntaxWarning on modern Python and slated to become an error).
    doc = (
        "\\documentclass[varwidth=\\maxdimen]{standalone}\n"
        "\\usepackage{multirow, multicol, booktabs}\n"
        "\\begin{document}\n"
        "\\begin{tabular}%s\\end{tabular}\n"
        "\\end{document}"
    )
    code = doc % output
    # Remove stale artifacts so a failed run cannot return a previous result.
    for stale in ('tmp.pdf', 'tmp.png'):
        if os.path.exists(stale):
            os.remove(stale)
    with open('tmp.tex', 'w+') as of:
        of.write(code)
    try:
        # nonstopmode keeps pdflatex from blocking on an interactive prompt
        # when the LaTeX source has errors; check=True surfaces the failure.
        subprocess.run(
            ["pdflatex", "-interaction=nonstopmode", "tmp.tex"], check=True
        )
        img = convert_from_path('tmp.pdf', fmt='png')[0]
        img.save('tmp.png', 'PNG')
        return Image.open('tmp.png').resize((200, 200))
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print(e)
        return None
def pil_loader(img, mode):
    """Return *img* converted to the given PIL color mode (e.g. "L", "RGB")."""
    converted = img.convert(mode=mode)
    return converted
class Tokenizer:
    """Maps between token strings and integer indices for formula sequences.

    The vocabulary passed in must already contain the four special tokens
    ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``; their indices are looked
    up (not assigned) here.
    """

    def __init__(self, token_to_index: Optional[Dict[str, int]] = None) -> None:
        """Build the tokenizer from a token -> index vocabulary.

        Args:
            token_to_index: Mapping from token string to integer id. Must be
                non-empty and include the four special tokens.

        Raises:
            ValueError: If no vocabulary is provided. (Previously an
                ``assert``, which is silently stripped under ``python -O``.)
            KeyError: If any special token is missing from the vocabulary.
        """
        self.pad_token = "<PAD>"
        self.sos_token = "<SOS>"
        self.eos_token = "<EOS>"
        self.unk_token = "<UNK>"
        if not token_to_index:
            raise ValueError("vocabulary with mapping from token to id?")
        self.token_to_index: Dict[str, int] = token_to_index
        # Inverse mapping for decode(); assumes token -> index is one-to-one.
        self.index_to_token: Dict[int, str] = {
            index: token for token, index in self.token_to_index.items()
        }
        self.pad_index = self.token_to_index[self.pad_token]
        self.sos_index = self.token_to_index[self.sos_token]
        self.eos_index = self.token_to_index[self.eos_token]
        self.unk_index = self.token_to_index[self.unk_token]
        # Special-token indices that decode() drops during inference.
        self.ignore_indices = {
            self.pad_index, self.sos_index, self.eos_index, self.unk_index
        }

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return len(self.token_to_index)

    def encode(self, formula: List[str]) -> List[int]:
        """Convert a token sequence to indices, framed by <SOS> ... <EOS>.

        Unknown tokens are mapped to the <UNK> index.
        """
        get_index = self.token_to_index.get
        return (
            [self.sos_index]
            + [get_index(token, self.unk_index) for token in formula]
            + [self.eos_index]
        )

    def decode(self, indices: List[int], inference: bool = True) -> List[str]:
        """Convert indices back to tokens, stopping at the first <EOS>.

        Args:
            indices: Sequence of token ids.
            inference: When True, special tokens (<PAD>/<SOS>/<EOS>/<UNK>)
                are dropped from the output; when False they are kept
                verbatim (except <EOS>, which always terminates decoding).

        Raises:
            RuntimeError: If an index is not in the vocabulary. Note this
                check deliberately runs before the <EOS> break, so unknown
                ids raise even past the end marker's position in a prefix.
        """
        tokens = []
        for index in indices:
            if index not in self.index_to_token:
                raise RuntimeError(f"Found an unknown index {index}")
            if index == self.eos_index:
                break
            if inference and index in self.ignore_indices:
                continue
            tokens.append(self.index_to_token[index])
        return tokens

    @classmethod
    def load(cls, filename: Union[Path, str]) -> "Tokenizer":
        """Create a `Tokenizer` from a mapping file outputted by `save`.

        Args:
            filename: Path to the JSON file to read from.

        Returns:
            A `Tokenizer` object.
        """
        # Explicit encoding so the vocabulary reads identically across platforms.
        with open(filename, encoding="utf-8") as f:
            token_to_index = json.load(f)
        return cls(token_to_index)