import json
import os
from typing import Dict, Iterable, List


class TextEncoder:
    """Map text to lists of integer ids using a JSON-persisted vocabulary.

    Text is lower-cased before tokenization. Tokens are single characters
    when ``by_char`` is true, otherwise whitespace-separated words. Ids are
    assigned from 1 upward in sorted token order.
    """

    def __init__(self, vocab_path="vocab.json", by_char=True):
        """Create an encoder, loading ``vocab_path`` if it already exists.

        Args:
            vocab_path: Path of the JSON file holding the token -> id map.
            by_char: Tokenize per character when true, per word otherwise.
        """
        self.vocab_path = vocab_path
        self.by_char = by_char
        self.vocab: Dict[str, int] = {}  # token -> id
        self.inverse_vocab: Dict[int, str] = {}  # id -> token
        if os.path.exists(vocab_path):
            self.load_vocab()

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into characters or words."""
        lowered = text.lower()
        return list(lowered) if self.by_char else lowered.split()

    def build_vocab(self, texts: Iterable[str]) -> None:
        """Rebuild the vocabulary from ``texts`` and write it to disk.

        Any previously loaded vocabulary is replaced. Ids start at 1 and
        follow the sorted order of the distinct tokens.
        """
        items = set()
        for text in texts:
            items.update(self._tokenize(text))
        self.vocab = {
            token: idx for idx, token in enumerate(sorted(items), start=1)
        }
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}
        self.save_vocab()

    def encode(self, text: str) -> List[int]:
        """Return the ids of the tokens in ``text``.

        Tokens that are not in the vocabulary are silently skipped, so the
        result may be shorter than the token sequence.
        """
        vocab = self.vocab
        return [vocab[token] for token in self._tokenize(text) if token in vocab]

    def decode(self, encoded_list: Iterable[int]) -> str:
        """Turn ids back into text; unknown ids are rendered as ``'?'``.

        Tokens are concatenated directly in character mode and joined with
        single spaces in word mode.
        """
        separator = '' if self.by_char else ' '
        return separator.join(
            self.inverse_vocab.get(num, '?') for num in encoded_list
        )

    def save_vocab(self) -> None:
        """Write the token -> id map to ``self.vocab_path`` as JSON."""
        # Explicit encoding keeps the file independent of the platform locale.
        with open(self.vocab_path, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f)

    def load_vocab(self) -> None:
        """Read the token -> id map from ``self.vocab_path`` and invert it."""
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        self.inverse_vocab = {int(v): k for k, v in self.vocab.items()}


def main() -> None:
    """Build a word vocabulary from ``s.txt``, encode it and run two demos."""
    encoder = TextEncoder(by_char=False)

    # Read as UTF-8 rather than the locale default so results are portable.
    with open("s.txt", "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f if line.strip()]

    encoder.build_vocab(texts)

    # Flatten the per-line encodings into one id sequence.
    encoded = [token_id for text in texts for token_id in encoder.encode(text)]

    with open("AgGPT10m.agmodel", "w", encoding="utf-8") as f:
        json.dump(encoded, f)

    print("Encoding complete. Saved to AgGPT10m.agmodel.")

    decoded_text = encoder.decode(encoded)
    print(f"Decoded text (first 50 characters): {decoded_text[:50]}...")

    test_string = "othello"
    encoded_test = encoder.encode(test_string)
    decoded_test = encoder.decode(encoded_test)
    print(f"Original test string: '{test_string}'")
    print(f"Encoded test: {encoded_test}")
    print(f"Decoded test: '{decoded_test}'")

    # Mixed case input: encode() lower-cases before the vocabulary lookup.
    test_string_2 = "iAgo"
    encoded_test_2 = encoder.encode(test_string_2)
    decoded_test_2 = encoder.decode(encoded_test_2)
    print(f"Original test string: '{test_string_2}'")
    print(f"Encoded test: {encoded_test_2}")
    print(f"Decoded test: '{decoded_test_2}'")


if __name__ == "__main__":
    main()