English
AgGPT10m / encoder.py
AGofficial's picture
Upload 8 files
de00bef verified
import json
import os
class TextEncoder:
    """Convert text to lists of integer ids and back via a JSON vocabulary.

    Text is lower-cased, then tokenized either one character at a time
    (``by_char=True``) or on whitespace (``by_char=False``). Ids start at 1
    and follow the sorted order of the tokens seen by ``build_vocab``.
    """

    def __init__(self, vocab_path="vocab.json", by_char=True):
        """Create an encoder and load the vocabulary file if it exists.

        Args:
            vocab_path: Path of the JSON file the vocabulary is loaded from
                and saved to.
            by_char: Tokenize per character when true, otherwise per
                whitespace-separated word.
        """
        self.vocab_path = vocab_path
        self.by_char = by_char
        self.vocab = {}
        self.inverse_vocab = {}
        # NOTE: the vocabulary file stores only the token -> id mapping, not
        # the tokenization mode, so a file written in one mode is loaded
        # unchecked even if ``by_char`` differs here.
        if os.path.exists(vocab_path):
            self.load_vocab()

    def _tokenize(self, text):
        """Lower-case *text* and split it into characters or words."""
        processed_text = text.lower()
        return list(processed_text) if self.by_char else processed_text.split()

    def build_vocab(self, texts):
        """Rebuild the vocabulary from *texts* and write it to ``vocab_path``.

        Args:
            texts: Iterable of strings to collect tokens from.
        """
        items = set()
        for text in texts:
            items.update(self._tokenize(text))
        # Sorting makes id assignment deterministic; numbering starts at 1.
        self.vocab = {token: idx for idx, token in enumerate(sorted(items), start=1)}
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}
        self.save_vocab()

    def encode(self, text):
        """Return the ids of the tokens in *text*.

        Tokens that are not in the vocabulary are dropped from the result.
        """
        return [
            self.vocab[token]
            for token in self._tokenize(text)
            if token in self.vocab
        ]

    def decode(self, encoded_list):
        """Return the text for *encoded_list*; unknown ids become ``'?'``.

        Tokens are concatenated directly in character mode and joined with
        single spaces in word mode.
        """
        separator = '' if self.by_char else ' '
        return separator.join(
            self.inverse_vocab.get(num, '?') for num in encoded_list
        )

    def save_vocab(self):
        """Write the token -> id mapping to ``vocab_path`` as JSON."""
        # json.dump escapes non-ASCII by default, so the bytes written are the
        # same as before; the explicit encoding just removes locale dependence.
        with open(self.vocab_path, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f)

    def load_vocab(self):
        """Read the token -> id mapping from ``vocab_path`` and invert it."""
        # UTF-8 so that files containing literal (unescaped) non-ASCII tokens
        # load identically on every platform.
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        self.inverse_vocab = {int(v): k for k, v in self.vocab.items()}
def main():
    """Build a word-level vocabulary from ``s.txt`` and encode the corpus.

    Reads the non-blank lines of ``s.txt``, builds the vocabulary from them,
    writes the encoded id list to ``AgGPT10m.agmodel`` as JSON, and prints a
    few encode/decode round trips.
    """
    encoder = TextEncoder(by_char=False)

    # Explicit UTF-8 so the corpus is decoded the same way on every platform
    # instead of with the locale's default encoding.
    with open("s.txt", "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        texts = [line for line in stripped_lines if line]

    encoder.build_vocab(texts)

    encoded = []
    for text in texts:
        encoded.extend(encoder.encode(text))

    with open("AgGPT10m.agmodel", "w", encoding="utf-8") as f:
        json.dump(encoded, f)
    print("Encoding complete. Saved to AgGPT10m.agmodel.")

    decoded_text = encoder.decode(encoded)
    print(f"Decoded text (first 50 characters): {decoded_text[:50]}...")

    # Round-trip demos; the mixed-case sample shows that input is lower-cased
    # before lookup.
    for test_string in ("othello", "iAgo"):
        encoded_test = encoder.encode(test_string)
        decoded_test = encoder.decode(encoded_test)
        print(f"Original test string: '{test_string}'")
        print(f"Encoded test: {encoded_test}")
        print(f"Decoded test: '{decoded_test}'")


if __name__ == "__main__":
    main()