Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import typing | |
| from collections import deque, defaultdict | |
class Tokenizer:
    """Pair-merging (BPE-style) tokenizer over a stream of string symbols.

    Training assigns one integer id to every distinct input symbol and then
    repeatedly merges the most frequent adjacent pair of tokens into a new
    token until the vocabulary reaches the requested size.

    Attributes:
        symbol_set: distinct base symbols of the last training run, or
            ``None`` before training.
        symbol_to_token: symbol string -> integer token id.
        token_to_symbol: integer token id -> symbol string.
        language_size: number of ids handed out so far (next free id).
        corpus: scratch slot; reset to ``None`` when training finishes.
    """

    def __init__(self):
        self.symbol_set: set = None
        self.symbol_to_token = {}
        self.token_to_symbol = {}
        self.language_size = 0
        self.corpus = None

    def _register_symbol(self, symbol) -> int:
        """Assign the next free token id to ``symbol`` and return that id."""
        token = self.language_size
        self.symbol_to_token[symbol] = token
        self.token_to_symbol[token] = symbol
        self.language_size += 1
        return token

    @staticmethod
    def _most_common_pair(tokens):
        """Return the adjacent pair that first reaches the highest count.

        Returns ``None`` when ``tokens`` holds fewer than two elements.
        """
        pair_counts = defaultdict(int)
        best_pair = None
        best_count = 0
        for pair in zip(tokens, tokens[1:]):
            pair_counts[pair] += 1
            # Strict '>' keeps the pair that got to the top count first.
            if pair_counts[pair] > best_count:
                best_count = pair_counts[pair]
                best_pair = pair
        return best_pair

    @staticmethod
    def _merge_pair(tokens, pair, merged_token):
        """Replace each non-overlapping occurrence of ``pair``, left to right."""
        merged = []
        index = 0
        last = len(tokens) - 1
        while index < len(tokens):
            if index < last and (tokens[index], tokens[index + 1]) == pair:
                merged.append(merged_token)
                index += 2
            else:
                # Advance by ONE so the next token can still start a pair.
                merged.append(tokens[index])
                index += 1
        return merged

    def train_tokenizer(self, input, max_language_size: int) -> None:
        """Build the vocabulary from ``input``.

        Args:
            input: either a comma-separated string of symbols or an iterable
                of symbols. Symbols must be hashable and support ``+``
                concatenation (merged symbols are built by concatenating the
                two parts).
            max_language_size: merging stops once ``language_size`` reaches
                this value. Merging also stops early when fewer than two
                tokens remain, because there is nothing left to merge.
        """
        if isinstance(input, str):
            symbols = input.split(",")
        else:
            symbols = list(input)
        self.symbol_set = set(symbols)

        # Register base symbols in first-appearance order so the ids are the
        # same on every run (iterating a set of str is hash-order dependent).
        for sym in dict.fromkeys(symbols):
            self._register_symbol(sym)

        # Convert everything from symbolic form to tokens.
        tokens = [self.symbol_to_token[sym] for sym in symbols]

        while self.language_size < max_language_size:
            common_pair = self._most_common_pair(tokens)
            if common_pair is None:
                break
            left, right = common_pair
            synthetic_symbol = self.token_to_symbol[left] + self.token_to_symbol[right]
            merged_token = self._register_symbol(synthetic_symbol)
            tokens = self._merge_pair(tokens, common_pair, merged_token)

        # The working corpus is not kept after training.
        self.corpus = None

    def decode(self, tokens: list[int]) -> str:
        """Concatenate the symbols of ``tokens``; unknown ids raise KeyError."""
        return "".join(self.token_to_symbol[t] for t in tokens)

    def encode(self, message: str):
        """Tokenize ``message`` character by character.

        The current candidate is extended one character at a time for as long
        as the extended string is a known symbol; when the extension fails the
        candidate's token is emitted and matching restarts at that character.
        A symbol is therefore only reachable if every prefix of it is also a
        known symbol.

        Raises:
            KeyError: if a character cannot start any known symbol.
        """
        result_tokens = []
        curr_symbol = ""
        position = 0
        while position < len(message):
            candidate = curr_symbol + message[position]
            if candidate in self.symbol_to_token:
                curr_symbol = candidate
                position += 1
                continue
            if not curr_symbol:
                # Nothing matched yet, so this character is out of vocabulary.
                raise KeyError(
                    f"character {message[position]!r} is not in the vocabulary"
                )
            result_tokens.append(self.symbol_to_token[curr_symbol])
            curr_symbol = ""
        if curr_symbol:
            result_tokens.append(self.symbol_to_token[curr_symbol])
        return result_tokens

    def encode_moves(self, moves: list[str]) -> list[int]:
        """Map each whole symbol in ``moves`` to its id (KeyError if unknown)."""
        return [self.symbol_to_token[move] for move in moves]

    def add_special_tokens(self, tokens: list[str]) -> dict[str, int]:
        """Register ``tokens`` and return a ``{token: id}`` mapping.

        A token that is already in the vocabulary keeps its existing id, so
        repeated calls (or duplicates in ``tokens``) do not orphan ids.
        """
        mapping = {}
        for tok in tokens:
            if tok not in self.symbol_to_token:
                self._register_symbol(tok)
            mapping[tok] = self.symbol_to_token[tok]
        return mapping
class DataLoader:
    """Read a whole text file into memory and expose it as ``corpus``."""

    # Class-level default; each instance overwrites it in __init__.
    corpus = None

    def __init__(self, file_name: str):
        """Open ``file_name`` in text mode and store its full contents."""
        with open(file_name, "r") as handle:
            contents = handle.read()
        self.corpus = contents