AGofficial committed
Commit f227bad · verified · 1 parent: 866019a

Upload 11 files

Files changed (12)
  1. .gitattributes +3 -0
  2. AGWM.json +3 -0
  3. AGWM.py +90 -0
  4. LICENSE +21 -0
  5. README.md +60 -3
  6. aggpt14.py +263 -0
  7. banner.png +3 -0
  8. chat.py +18 -0
  9. main.py +16 -0
  10. training_data.py +2 -0
  11. training_data/WM.txt +0 -0
  12. training_data/corpus.txt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ AGWM.json filter=lfs diff=lfs merge=lfs -text
+ banner.png filter=lfs diff=lfs merge=lfs -text
+ training_data/corpus.txt filter=lfs diff=lfs merge=lfs -text
AGWM.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cca87f0b163dd488d8baa2020a4af457f44bdd5bbc37583ae29fa7bcfdbe7575
size 14511404
AGWM.py ADDED
@@ -0,0 +1,90 @@
import random
import re
import os
import json
from collections import defaultdict, Counter

class MarkovChain:
    def __init__(self):
        self.model = defaultdict(Counter)
        self.starting_keys = []

    def train(self, text):
        words = re.findall(r'\b\w+\b|[.!?]', text)
        for i in range(len(words) - 5):
            w1, w2, w3, w4, w5 = words[i], words[i + 1], words[i + 2], words[i + 3], words[i + 4]
            key = (w1, w2, w3, w4)
            self.model[key][w5] += 1
            if w1[0].isupper() and (i == 0 or words[i - 1] in '.!?'):
                self.starting_keys.append(key)

    def generate(self, min_sentences=2, max_length=100):
        if not self.starting_keys:
            raise ValueError("No valid sentence starters found.")
        key = random.choice(self.starting_keys)
        result = [key[0], key[1], key[2], key[3]]
        sentence_count = 0

        for _ in range(max_length - 4):
            next_words = self.model.get(key)
            if not next_words:
                break
            words, weights = zip(*next_words.items())
            next_word = random.choices(words, weights=weights, k=1)[0]
            result.append(next_word)
            if next_word in '.!?':
                sentence_count += 1
                if sentence_count >= min_sentences:
                    break
            key = (key[1], key[2], key[3], next_word)

        text = ' '.join(result)
        text = re.sub(r'\s+([.!?])', r'\1', text)
        return text

    def save_to_json(self, filename):
        data = {
            "model": {
                ",".join(k): {word: count for word, count in counter.items()}
                for k, counter in self.model.items()
            },
            "starting_keys": [",".join(k) for k in self.starting_keys]
        }
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f)
        print(f"Model saved to {filename}")

    def load_from_json(self, filename):
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.model = defaultdict(Counter, {
            tuple(k.split(",")): Counter(v) for k, v in data["model"].items()
        })
        self.starting_keys = [tuple(k.split(",")) for k in data["starting_keys"]]
        print(f"Model loaded from {filename}")

def train_and_save_model(filename_text, filename_json_model):
    with open(filename_text, "r", encoding="utf-8") as f:
        text = f.read()

    chain = MarkovChain()
    chain.train(text)
    chain.save_to_json(filename_json_model)
    return chain

def load_model(filename_json_model):
    chain = MarkovChain()
    chain.load_from_json(filename_json_model)
    return chain


if __name__ == "__main__":
    text_file = "training_data/WM.txt"
    model_file = "AGWM.json"

    if os.path.exists(model_file):
        chain = load_model(model_file)
    else:
        chain = train_and_save_model(text_file, model_file)

    print(chain.generate(min_sentences=3))
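For reference, save_to_json flattens each 4-word context into a single comma-joined string key before writing JSON. A minimal sketch of the resulting AGWM.json layout, with made-up words and counts (the real file is a ~14 MB LFS object):

# Hypothetical contents illustrating the structure MarkovChain.save_to_json writes out.
data = {
    "model": {
        "The,cat,sat,on": {"the": 3, "a": 1},   # 4-word context -> counts of observed next words
    },
    "starting_keys": ["The,cat,sat,on"],        # contexts seen at sentence starts
}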
LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md CHANGED
@@ -1,3 +1,60 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ language:
+ - en
+ ---
+
+ # AgGPT-14
+
+ <img src="banner.png" alt="AgGPT-14 Banner" width="100%">
+
+ ## Light. Pro. Smart.
+
+ AgGPT-14 is our state-of-the-art language model.
+
+ # AgGPT-14: Advanced Generative Conversational AI
+
+ AgGPT-14 is a lightweight, Python-based AI model designed for conversational tasks with context-aware responses. It combines n-gram-style Markov chains with a similarity-driven context selection mechanism, producing coherent, human-like responses from a training corpus.
+
+ ---
+
+ ## Features
+
+ 1. **Deterministic Context Matching**
+    - Uses aggressive TF-IDF-inspired similarity scoring combined with Longest Common Subsequence (LCS) detection to find the best-matching user query in the training corpus.
+    - Ensures responses are relevant to the user's input.
+
+ 2. **World Model Integration**
+    - Generates simple "world model" text to enhance conversational depth when enabled.
+    - Can prepend or combine world model outputs with AI responses.
+
+ 3. **N-Gram AI Response Generation**
+    - Generates responses using an n-gram Markov model (configurable `order`) built from the AI responses in the training corpus.
+    - Supports temperature-based sampling and top-k filtering for diverse outputs.
+
+ 4. **Text Normalization**
+    - Expands common contractions.
+    - Tokenizes text into clean, lowercase tokens.
+    - Detokenizes output with proper punctuation and capitalization.
+
+ 5. **IDF-Weighted Matching**
+    - Emphasizes rare words in similarity scoring to capture nuanced user queries.
+
+ 6. **Debugging Support**
+    - Provides detailed debug information about tokenization, similarity scores, and context selection for each user query.
+
+ ---
+
+ ## Notes
+
+ - Designed for offline usage; no external API calls are required.
+ - Lightweight and fast; ideal for experimentation and educational purposes.
+ - Can be easily extended with more advanced NLP techniques for higher-quality responses.
+
+ This model does not perform as well as transformer-based models such as GPT-3.5 or GPT-4, but it is designed to be lightweight. It is also less scalable than AgGPT-9 and AgGPT-10, which, as full transformer models, are inherently more scalable but also more complex and resource-intensive; further research and development is needed to close that gap. The goal is a model that is lightweight, fast, and easy to use while still producing high-quality responses, and that is not a black box like most GPT models, so that developers and researchers can easily understand and modify it.
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
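The corpus format is not shown in the README, and training_data/corpus.txt (below) is stored as a Git LFS pointer. Judging from the _parse_pairs regex in aggpt14.py, entries follow a user/<pad>/ai/<eos> layout; the two lines below are illustrative placeholders, not taken from the actual corpus:

user: hi <pad> ai: hello! how can I help you today? <eos>
user: tell me a joke <pad> ai: why did the scarecrow win an award? because he was outstanding in his field. <eos>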
aggpt14.py ADDED
@@ -0,0 +1,263 @@
import re
import random
from collections import Counter, defaultdict
from training_data import corpus
from AGWM import *


ModelName = 'AgGPT-14'


def world_model(length=10):
    """Generates a simple world model for demonstration purposes."""

    text_file = "training_data/WM.txt"
    model_file = "AGWM.json"

    if os.path.exists(model_file):
        chain = load_model(model_file)
    else:
        chain = train_and_save_model(text_file, model_file)

    return chain.generate(min_sentences=length)

class AgGPT14:
    def __init__(self, corpus_text, order=3, seed=None):
        assert order >= 1, "order must be >= 1"
        self.model_name = ModelName
        self.order = order
        self.rng = random.Random(seed)

        self.pairs = self._parse_pairs(corpus_text)
        if not self.pairs:
            raise ValueError("No (user, ai) pairs found in corpus.")

        self.user_docs = [self._tokenize(u) for u, _ in self.pairs]
        self.ai_docs = [self._tokenize(a) for _, a in self.pairs]

        self.idf_weights = self._calculate_idf(self.user_docs)

        self.global_transitions = self._build_global_transitions(self.ai_docs)
        self.unigram = self._build_unigram(self.ai_docs)

        self.user_ai_pairs = list(zip(self.user_docs, self.ai_docs))

    def _calculate_idf(self, docs):
        """Calculates an aggressive IDF score to emphasize rare words."""
        N = len(docs)
        doc_freq = Counter()
        for doc in docs:
            for word in set(doc):
                doc_freq[word] += 1

        idf = {word: (N / (count + 1)) ** 2 for word, count in doc_freq.items()}
        return idf

    def _lcs(self, a, b):
        """Finds the Longest Common Subsequence between two lists of tokens."""
        lengths = [[0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
        for i, x in enumerate(a):
            for j, y in enumerate(b):
                if x == y:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
        result = []
        x, y = len(a), len(b)
        while x != 0 and y != 0:
            if lengths[x][y] == lengths[x - 1][y]:
                x -= 1
            elif lengths[x][y] == lengths[x][y - 1]:
                y -= 1
            else:
                result.append(a[x - 1])
                x -= 1
                y -= 1
        return result[::-1]

    def _parse_pairs(self, text):
        pattern = re.compile(
            r"user:\s*(.*?)\s*<pad>\s*ai:\s*(.*?)\s*<eos>",
            re.DOTALL | re.IGNORECASE
        )
        pairs = []
        for u, a in pattern.findall(text):
            u, a = u.strip(), a.strip()
            if u and a:
                pairs.append((u, a))
        return pairs

    def _expand_contractions(self, s):
        s = re.sub(r"what's", "what is", s)
        s = re.sub(r"that's", "that is", s)
        s = re.sub(r"it's", "it is", s)
        s = re.sub(r"how's", "how is", s)
        s = re.sub(r"he's", "he is", s)
        s = re.sub(r"she's", "she is", s)
        s = re.sub(r"you're", "you are", s)
        s = re.sub(r"i'm", "i am", s)
        s = re.sub(r"didn't", "did not", s)
        s = re.sub(r"don't", "do not", s)
        s = re.sub(r"can't", "cannot", s)
        return s

    def _tokenize(self, s):
        s = s.strip().lower()
        s = self._expand_contractions(s)
        tokens = re.findall(r"[a-z]+(?:'[a-z]+)?|[?.!,;:]", s)
        return [t for t in tokens if t]

    def _with_bounds(self, tokens):
        return ["<s>"] * self.order + tokens + ["</s>"]

    def _similarity(self, query_tokens, doc_tokens):
        if not query_tokens or not doc_tokens:
            return 0.0
        common_words = set(query_tokens).intersection(set(doc_tokens))
        if not common_words:
            return 0.0
        idf_score = sum(self.idf_weights.get(word, 0.1) for word in common_words)
        lcs = self._lcs(query_tokens, doc_tokens)
        order_bonus_factor = 0.5
        order_bonus = sum(self.idf_weights.get(word, 0.1) for word in lcs) * order_bonus_factor
        return idf_score + order_bonus

    def _find_best_match(self, user_text):
        q_tokens = self._tokenize(user_text)
        if not q_tokens:
            return None

        best_score = -1.0
        best_idx = -1
        for i, user_doc in enumerate(self.user_docs):
            sim = self._similarity(q_tokens, user_doc)
            if sim > best_score:
                best_score = sim
                best_idx = i

        if best_idx == -1 or best_score < 0.1:
            return None
        return best_idx

    def _build_global_transitions(self, docs):
        trans = defaultdict(Counter)
        for tokens in docs:
            seq = self._with_bounds(tokens)
            for i in range(len(seq) - self.order):
                ctx = tuple(seq[i : i + self.order])
                nxt = seq[i + self.order]
                trans[ctx][nxt] += 1
        return trans

    def _build_unigram(self, docs):
        uni = Counter()
        for d in docs:
            uni.update(d)
        return uni

    def _get_best_starting_context(self, user_text):
        """Finds the best match and deterministically returns its starting context."""
        best_match_idx = self._find_best_match(user_text)

        if best_match_idx is not None:
            ai_doc = self.ai_docs[best_match_idx]
            if len(ai_doc) >= self.order:
                return tuple(ai_doc[:self.order])

        return tuple(["<s>"] * self.order)

    def _sample_next(self, context, temperature, top_k):
        ctx = context
        while len(ctx) > 0:
            if ctx in self.global_transitions and self.global_transitions[ctx]:
                counter = self.global_transitions[ctx]
                break
            ctx = ctx[1:]
        else:
            counter = Counter({k: v for k, v in self.unigram.items() if k not in ["<s>", "</s>"]})

        if not counter: return "</s>"
        items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[:top_k]
        if not items: return "</s>"
        if temperature <= 0: return items[0][0]

        tokens, weights = zip(*items)
        scaled_weights = [w ** (1.0 / temperature) for w in weights]
        return self.rng.choices(tokens, weights=scaled_weights, k=1)[0]

    def _detokenize(self, tokens):
        if not tokens: return ""
        text = " ".join(t for t in tokens if t not in ["<s>", "</s>"])
        text = re.sub(r'\s+([?.!,;:])', r'\1', text)
        text = re.sub(r" ([']) ", r"\1", text)
        if text: text = text[0].upper() + text[1:]
        text = re.sub(r'([.!?]\s*)([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)
        text = re.sub(r'\bi\b', 'I', text)
        return text

    def respond(self, user_text, max_tokens=25, temperature=0.7, top_k=8, use_context_selection=True):
        ctx = self._get_best_starting_context(user_text) if use_context_selection else tuple(["<s>"] * self.order)
        out = list(ctx)
        for _ in range(max_tokens):
            nxt = self._sample_next(ctx, temperature, top_k)
            if nxt == "</s>": break
            out.append(nxt)
            ctx = tuple(out[-self.order:])
        return self._detokenize(out)

    def ask(self, prompt, text_world_model=False, **kwargs):
        """User-friendly wrapper for the respond method."""
        response = self.respond(prompt, **kwargs)

        if text_world_model:
            wm_response = world_model(length=10)
            wm_response = "<world_model>" + wm_response + "</world_model>"
            response = wm_response + " " + response

        return response

    def get_debug_info(self, user_text):
        q_tokens = self._tokenize(user_text)
        print(f"--- Debug info for: '{user_text}' ---")
        print(f"Query Tokens (after normalization): {q_tokens}\n")

        best_match_idx = self._find_best_match(user_text)

        if best_match_idx is not None:
            best_score = self._similarity(q_tokens, self.user_docs[best_match_idx])
            print("Determined Best Match:")
            print(f"  - Corpus Entry: {' '.join(self.user_docs[best_match_idx])}")
            print(f"  - Score: {best_score:.2f}")
            print(f"  - Corresponding AI response will be used for context.")
        else:
            print("No suitable match found. Will use default starting context.")


if __name__ == "__main__":
    print(f"Initializing model: {ModelName}")
    bot = AgGPT14(corpus, order=3, seed=42)

    print("\n=== Demonstrating the Fix for 'color' query ===")
    bot.get_debug_info("what is your favorite color?")

    print("\n=== Testing Model with Deterministic Matching ===")
    tests = [
        "hi",
        "tell me a joke",
        "do you have hobbies?",
        "what is your favorite color?",
        "thanks a lot",
    ]
    for t in tests:
        print(f"user: {t}")
        response = bot.ask(t)
        print(f"ai: {response}")
        print("-" * 40)

    print("====WORLD MODEL====")
    print(world_model())

    prompt = "hello, how are you?"
    print(f"\nPrompt: {prompt}")
    response = bot.ask(prompt, max_tokens=20, temperature=0.5, top_k=5, text_world_model=True)
    print(f"Response: {response}")
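For intuition on the sampling step: _sample_next raises each candidate count to the power 1/temperature before weighted sampling, so temperatures below 1.0 sharpen the distribution toward the most frequent continuation, and temperature <= 0 falls back to picking the top item deterministically. A minimal sketch with hypothetical counts (not taken from the corpus):

# Hypothetical next-token counts for one context, reweighted the way _sample_next does.
counts = {"fine": 4, "good": 2, "okay": 1}
temperature = 0.5
weights = {w: c ** (1.0 / temperature) for w, c in counts.items()}
# -> {'fine': 16.0, 'good': 4.0, 'okay': 1.0}
# At temperature 1.0, "fine" holds 4/7 of the probability mass; at 0.5 it holds 16/21.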
banner.png ADDED

Git LFS Details

  • SHA256: e1e0af349a46dbbc7fbabf9072b7075eb64accae7c3f85f952cfdf2a184261f5
  • Pointer size: 131 Bytes
  • Size of remote file: 519 kB
chat.py ADDED
@@ -0,0 +1,18 @@
from aggpt14 import AgGPT14
from training_data import corpus

if __name__ == "__main__":
    model = AgGPT14(corpus, order=4, seed=None)

    print("Chat with AgGPT14 (type 'quit' to exit)")
    print("-" * 40)

    while True:
        prompt = input("You: ")

        if prompt.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break

        response = model.ask(prompt, max_tokens=999999, temperature=0.5, top_k=5, text_world_model=False)
        print(f"AI: {response}\n")
main.py ADDED
@@ -0,0 +1,16 @@
from aggpt14 import AgGPT14
from training_data import corpus

if __name__ == "__main__":
    model = AgGPT14(corpus, order=3, seed=None)

    prompt = "What is your favorite color?"
    print(f"User: {prompt}")
    response = model.ask(prompt)
    print(f"AI: {response}")


    prompt = "hello, how are you?"
    print(f"\nPrompt: {prompt}")
    response = model.ask(prompt, max_tokens=20, temperature=0.5, top_k=5, text_world_model=True)
    print(f"Response: {response}")
training_data.py ADDED
@@ -0,0 +1,2 @@
with open("training_data/corpus.txt", "r", encoding="utf-8") as file:
    corpus = file.read()
training_data/WM.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_data/corpus.txt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20c3209640ce4cf4efffa0ece52852e606e40c32892c1a65fe8ed46934b109b8
size 49492881