| import json | |
| import os | |
class TextEncoder:
    """Convert text to lists of integer ids and back via a JSON-backed vocabulary.

    Text is always lower-cased before tokenization. With ``by_char=True``
    every character (whitespace included) is a token; otherwise tokens are
    the whitespace-separated words of the text.

    Attributes are plain and public:
        vocab_path: path of the JSON file holding the token -> id mapping.
        by_char: tokenization mode, see above.
        vocab: dict mapping token -> id (ids start at 1).
        inverse_vocab: dict mapping id -> token.
    """

    def __init__(self, vocab_path="vocab.json", by_char=True):
        """Create an encoder, loading the vocabulary if *vocab_path* exists.

        Args:
            vocab_path: location of the vocabulary JSON file.
            by_char: tokenize per character when true, per word otherwise.
        """
        self.vocab_path = vocab_path
        self.by_char = by_char
        self.vocab = {}
        self.inverse_vocab = {}
        if os.path.exists(vocab_path):
            self.load_vocab()

    def _tokenize(self, text):
        """Return the lower-cased tokens of *text* for the current mode."""
        lowered = text.lower()
        return list(lowered) if self.by_char else lowered.split()

    def build_vocab(self, texts):
        """Rebuild the vocabulary from *texts* and write it to ``vocab_path``.

        Any previously held vocabulary is replaced. Ids are assigned from 1
        upward in sorted token order, so the result does not depend on the
        order of *texts*.

        Args:
            texts: iterable of strings to collect tokens from.
        """
        items = set()
        for text in texts:
            items.update(self._tokenize(text))
        self.vocab = {token: idx for idx, token in enumerate(sorted(items), start=1)}
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}
        self.save_vocab()

    def encode(self, text):
        """Return the ids of the tokens in *text*.

        Tokens that are not in the vocabulary are skipped, so the result may
        be shorter than the token sequence.
        """
        return [self.vocab[token] for token in self._tokenize(text) if token in self.vocab]

    def decode(self, encoded_list):
        """Return the text for *encoded_list*; unknown ids become ``'?'``.

        Tokens are concatenated directly in character mode and joined with a
        single space in word mode.
        """
        separator = '' if self.by_char else ' '
        return separator.join(self.inverse_vocab.get(num, '?') for num in encoded_list)

    def save_vocab(self):
        """Write ``vocab`` as JSON to ``vocab_path``, overwriting the file."""
        # Explicit encoding keeps the file independent of the platform locale.
        with open(self.vocab_path, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f)

    def load_vocab(self):
        """Read ``vocab`` from ``vocab_path`` and rebuild ``inverse_vocab``."""
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        self.inverse_vocab = {int(v): k for k, v in self.vocab.items()}
if __name__ == "__main__":
    # Script entry point: build a word-level vocabulary from s.txt, encode the
    # whole corpus to AgGPT10m.agmodel, then print a few round-trip checks.
    encoder = TextEncoder(by_char=False)

    # Read the corpus as UTF-8 explicitly so decoding does not depend on the
    # platform's locale; blank lines are dropped and the rest are stripped.
    with open("s.txt", "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        texts = [line for line in stripped_lines if line]

    encoder.build_vocab(texts)

    # Concatenate the ids of every line into one flat sequence.
    encoded = []
    for text in texts:
        encoded.extend(encoder.encode(text))

    with open("AgGPT10m.agmodel", "w", encoding="utf-8") as f:
        json.dump(encoded, f)
    print("Encoding complete. Saved to AgGPT10m.agmodel.")

    decoded_text = encoder.decode(encoded)
    print(f"Decoded text (first 50 characters): {decoded_text[:50]}...")

    # Round-trip sample strings; the mixed-case one exercises lower-casing.
    for test_string in ("othello", "iAgo"):
        encoded_test = encoder.encode(test_string)
        decoded_test = encoder.decode(encoded_test)
        print(f"Original test string: '{test_string}'")
        print(f"Encoded test: {encoded_test}")
        print(f"Decoded test: '{decoded_test}'")