Upload 11 files
Browse files- .gitattributes +3 -0
- AGWM.json +3 -0
- AGWM.py +90 -0
- LICENSE +21 -0
- README.md +60 -3
- aggpt14.py +263 -0
- banner.png +3 -0
- chat.py +18 -0
- main.py +16 -0
- training_data.py +2 -0
- training_data/WM.txt +0 -0
- training_data/corpus.txt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
AGWM.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
banner.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
training_data/corpus.txt filter=lfs diff=lfs merge=lfs -text
|
AGWM.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cca87f0b163dd488d8baa2020a4af457f44bdd5bbc37583ae29fa7bcfdbe7575
|
| 3 |
+
size 14511404
|
AGWM.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import re
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
from collections import defaultdict, Counter
|
| 6 |
+
|
| 7 |
+
class MarkovChain:
    """Order-4 word-level Markov chain text generator.

    Maps each 4-word context to a Counter of observed successor words.
    Contexts seen at the start of a sentence are remembered in
    ``starting_keys`` so generation can begin at a sentence boundary.
    """

    def __init__(self):
        # (w1, w2, w3, w4) context -> Counter of next-word frequencies
        self.model = defaultdict(Counter)
        # Contexts that opened a sentence in the training text.
        self.starting_keys = []

    def train(self, text):
        """Accumulate 5-gram transition counts from *text*.

        Tokenization keeps words and sentence-ending punctuation (.!?)
        as separate tokens so sentence boundaries survive generation.
        """
        words = re.findall(r'\b\w+\b|[.!?]', text)
        # The last 5-gram starts at len(words) - 5, so the exclusive loop
        # bound is len(words) - 4.  (The previous bound of len(words) - 5
        # silently dropped the final 5-gram of the corpus, and produced no
        # training at all for a corpus of exactly five tokens.)
        for i in range(len(words) - 4):
            w1, w2, w3, w4, w5 = words[i], words[i + 1], words[i + 2], words[i + 3], words[i + 4]
            key = (w1, w2, w3, w4)
            self.model[key][w5] += 1
            # A capitalized word at the very start, or right after
            # sentence-ending punctuation, marks a sentence opener.
            if w1[0].isupper() and (i == 0 or words[i - 1] in '.!?'):
                self.starting_keys.append(key)

    def generate(self, min_sentences=2, max_length=100):
        """Generate text containing at least *min_sentences* sentences.

        Stops early if the chain reaches a context with no known
        successors, or after roughly *max_length* tokens in total.

        Raises:
            ValueError: If the model was never trained on text containing
                a recognizable sentence start.
        """
        if not self.starting_keys:
            raise ValueError("No valid sentence starters found.")
        key = random.choice(self.starting_keys)
        result = [key[0], key[1], key[2], key[3]]
        sentence_count = 0

        for _ in range(max_length - 4):
            next_words = self.model.get(key)
            if not next_words:
                break  # dead end: context never seen during training
            words, weights = zip(*next_words.items())
            next_word = random.choices(words, weights=weights, k=1)[0]
            result.append(next_word)
            if next_word in '.!?':
                sentence_count += 1
                if sentence_count >= min_sentences:
                    break
            # Slide the 4-word context window forward by one token.
            key = (key[1], key[2], key[3], next_word)

        text = ' '.join(result)
        # Re-attach punctuation to the preceding word ("dog ." -> "dog.").
        text = re.sub(r'\s+([.!?])', r'\1', text)
        return text

    def save_to_json(self, filename):
        """Serialize the model to *filename* as JSON.

        Tuple keys are flattened to comma-joined strings because JSON
        object keys must be strings; the tokenizer never emits commas,
        so the join is reversible.
        """
        data = {
            "model": {
                ",".join(k): {word: count for word, count in counter.items()}
                for k, counter in self.model.items()
            },
            "starting_keys": [",".join(k) for k in self.starting_keys]
        }
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f)
        # Fixed: the f-string previously printed a literal placeholder
        # instead of interpolating the actual filename.
        print(f"Model saved to {filename}")

    def load_from_json(self, filename):
        """Restore a model previously written by save_to_json()."""
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.model = defaultdict(Counter, {
            tuple(k.split(",")): Counter(v) for k, v in data["model"].items()
        })
        self.starting_keys = [tuple(k.split(",")) for k in data["starting_keys"]]
        print(f"Model loaded from {filename}")
|
| 65 |
+
|
| 66 |
+
def train_and_save_model(filename_text, filename_json_model):
    """Read a text corpus, train a MarkovChain on it, and persist it as JSON.

    Args:
        filename_text: Path of the UTF-8 training corpus to read.
        filename_json_model: Path where the trained model JSON is written.

    Returns:
        The trained MarkovChain instance.
    """
    with open(filename_text, "r", encoding="utf-8") as corpus_file:
        corpus_text = corpus_file.read()

    model = MarkovChain()
    model.train(corpus_text)
    model.save_to_json(filename_json_model)
    return model
|
| 74 |
+
|
| 75 |
+
def load_model(filename_json_model):
    """Instantiate a MarkovChain and populate it from a saved JSON model."""
    restored = MarkovChain()
    restored.load_from_json(filename_json_model)
    return restored
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    # Source corpus and the JSON cache for the trained model.
    text_file = "training_data/WM.txt"
    model_file = "AGWM.json"

    # Reuse the cached model if it exists; otherwise train from the
    # corpus and cache the result for next time.
    if os.path.exists(model_file):
        chain = load_model(model_file)
    else:
        chain = train_and_save_model(text_file, model_file)

    print(chain.generate(min_sentences=3))
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,3 +1,60 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# AgGPT-14
|
| 8 |
+
|
| 9 |
+
<img src="banner.png" alt="AgGPT-14 Banner" width="100%">
|
| 10 |
+
|
| 11 |
+
## Light. Pro. Smart.
|
| 12 |
+
|
| 13 |
+
AgGPT-14 is our state-of-the-art language model.
|
| 14 |
+
|
| 15 |
+
# AgGPT-14: Advanced Generative Conversational AI
|
| 16 |
+
|
| 17 |
+
AgGPT-14 is a lightweight, Python-based AI model designed for conversational tasks with context-aware responses. It combines n-gram style Markov chains with a similarity-driven context selection mechanism, providing coherent and human-like responses based on a training corpus.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Features
|
| 22 |
+
|
| 23 |
+
1. **Deterministic Context Matching**
|
| 24 |
+
   - Uses an aggressive TF-IDF inspired similarity scoring combined with Longest Common Subsequence (LCS) detection to find the best matching user query from the training corpus.
   - Ensures responses are relevant to the user's input.
|
| 27 |
+
|
| 28 |
+
2. **World Model Integration**
|
| 29 |
+
- Generates simple "world model" text to enhance conversational depth when enabled.
|
| 30 |
+
- Can prepend or combine world model outputs with AI responses.
|
| 31 |
+
|
| 32 |
+
3. **N-Gram AI Response Generation**
|
| 33 |
+
- Generates responses using an n-gram Markov model (configurable `order`) built from AI responses in the training corpus.
|
| 34 |
+
- Supports temperature-based sampling and top-k filtering for diverse outputs.
|
| 35 |
+
|
| 36 |
+
4. **Text Normalization**
|
| 37 |
+
- Expands common contractions.
|
| 38 |
+
- Tokenizes text into clean, lowercase tokens.
|
| 39 |
+
- Detokenizes output with proper punctuation and capitalization.
|
| 40 |
+
|
| 41 |
+
5. **IDF-Weighted Matching**
|
| 42 |
+
- Emphasizes rare words in similarity scoring to capture nuanced user queries.
|
| 43 |
+
|
| 44 |
+
6. **Debugging Support**
|
| 45 |
+
- Provides detailed debug information about tokenization, similarity scores, and context selection for each user query.
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
# Notes
|
| 50 |
+
|
| 51 |
+
Designed for offline usage; no external API calls required.
|
| 52 |
+
Lightweight and fast; ideal for experimentation and educational purposes.
|
| 53 |
+
Can be easily extended with more advanced NLP techniques for higher-quality responses.
|
| 54 |
+
|
| 55 |
+
We note that this model does not perform as well as traditional transformer-based models such as GPT-3.5 or GPT-4, but it is deliberately designed to be lightweight. It is also less scalable, so further research and development is needed to bring its performance and scalability closer to AgGPT-9 and AgGPT-10; those models are inherently more scalable, but also more complex and resource-intensive, as they are full transformer models. Our goal is a model that is lightweight, fast, and easy to use while still providing high-quality responses — and, unlike most GPT models, not a black box, so that developers and researchers can easily understand and modify it.
|
| 56 |
+
|
| 57 |
+
## License
|
| 58 |
+
|
| 59 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 60 |
+
|
aggpt14.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import random
|
| 3 |
+
from collections import Counter, defaultdict
|
| 4 |
+
from training_data import corpus
|
| 5 |
+
from AGWM import *
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
ModelName = 'AgGPT-14'
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def world_model(length=10):
    """Generate "world model" text from the AGWM Markov chain.

    Loads the serialized model from AGWM.json when present; otherwise
    trains a new chain from training_data/WM.txt and caches it.

    Args:
        length: Minimum number of sentences to generate.

    Returns:
        A string of generated text.
    """
    # Explicit import: previously this relied on `os` leaking in via
    # `from AGWM import *`, which breaks if AGWM's imports change.
    import os

    text_file = "training_data/WM.txt"
    model_file = "AGWM.json"

    # Reuse the cached model when available; training is the slow path.
    if os.path.exists(model_file):
        chain = load_model(model_file)
    else:
        chain = train_and_save_model(text_file, model_file)

    return chain.generate(min_sentences=length)
|
| 23 |
+
|
| 24 |
+
class AgGPT14:
    """Retrieval-flavored Markov chat model.

    Parses (user, ai) pairs out of a tagged corpus, picks the
    best-matching stored user query via IDF-weighted word overlap plus a
    Longest-Common-Subsequence order bonus, then generates a reply with
    an order-n Markov model built from the corpus's AI responses, seeded
    from the matched response's opening tokens.
    """

    def __init__(self, corpus_text, order=3, seed=None):
        """Parse the corpus and build all lookup/generation structures.

        Args:
            corpus_text: Raw corpus containing "user: ... <pad> ai: ... <eos>" entries.
            order: Markov context length (number of preceding tokens), >= 1.
            seed: Optional RNG seed for reproducible sampling.

        Raises:
            ValueError: If no (user, ai) pairs can be parsed from the corpus.
        """
        assert order >= 1, "order must be >= 1"
        self.model_name = ModelName
        self.order = order
        self.rng = random.Random(seed)

        self.pairs = self._parse_pairs(corpus_text)
        if not self.pairs:
            raise ValueError("No (user, ai) pairs found in corpus.")

        # Tokenized user queries / AI replies, index-aligned with self.pairs.
        self.user_docs = [self._tokenize(u) for u, _ in self.pairs]
        self.ai_docs = [self._tokenize(a) for _, a in self.pairs]

        # IDF over user queries: rare words dominate similarity scoring.
        self.idf_weights = self._calculate_idf(self.user_docs)

        # Markov transitions plus a unigram fallback, built from AI replies only.
        self.global_transitions = self._build_global_transitions(self.ai_docs)
        self.unigram = self._build_unigram(self.ai_docs)

        self.user_ai_pairs = list(zip(self.user_docs, self.ai_docs))

    def _calculate_idf(self, docs):
        """Calculates an aggressive IDF score to emphasize rare words."""
        N = len(docs)
        doc_freq = Counter()
        for doc in docs:
            # set(doc): count each word at most once per document.
            for word in set(doc):
                doc_freq[word] += 1

        # Squaring (N / (df + 1)) exaggerates the weight of rare words
        # relative to classic log-IDF.
        idf = {word: (N / (count + 1)) ** 2 for word, count in doc_freq.items()}
        return idf

    def _lcs(self, a, b):
        """Finds the Longest Common Subsequence between two lists of tokens."""
        # Standard dynamic-programming LCS table, O(len(a) * len(b)).
        lengths = [[0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
        for i, x in enumerate(a):
            for j, y in enumerate(b):
                if x == y:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
        # Backtrack from the bottom-right corner to recover the subsequence.
        result = []
        x, y = len(a), len(b)
        while x != 0 and y != 0:
            if lengths[x][y] == lengths[x - 1][y]:
                x -= 1
            elif lengths[x][y] == lengths[x][y - 1]:
                y -= 1
            else:
                result.append(a[x - 1])
                x -= 1
                y -= 1
        return result[::-1]

    def _parse_pairs(self, text):
        """Extract (user, ai) text pairs delimited by <pad> / <eos> tags."""
        pattern = re.compile(
            r"user:\s*(.*?)\s*<pad>\s*ai:\s*(.*?)\s*<eos>",
            re.DOTALL | re.IGNORECASE
        )
        pairs = []
        for u, a in pattern.findall(text):
            u, a = u.strip(), a.strip()
            # Discard entries where either side is empty.
            if u and a:
                pairs.append((u, a))
        return pairs

    def _expand_contractions(self, s):
        """Expand a fixed set of common English contractions in *s*.

        NOTE(review): patterns are unanchored (no \\b), so e.g. "she's"
        is handled via the "he's" substring match — verify ordering
        before adding new patterns.
        """
        s = re.sub(r"what's", "what is", s)
        s = re.sub(r"that's", "that is", s)
        s = re.sub(r"it's", "it is", s)
        s = re.sub(r"how's", "how is", s)
        s = re.sub(r"he's", "he is", s)
        s = re.sub(r"she's", "she is", s)
        s = re.sub(r"you're", "you are", s)
        s = re.sub(r"i'm", "i am", s)
        s = re.sub(r"didn't", "did not", s)
        s = re.sub(r"don't", "do not", s)
        s = re.sub(r"can't", "cannot", s)
        return s

    def _tokenize(self, s):
        """Lowercase, expand contractions, and split into word/punctuation tokens."""
        s = s.strip().lower()
        s = self._expand_contractions(s)
        # Words (optionally with an internal apostrophe) or single
        # punctuation marks; digits and other symbols are dropped.
        tokens = re.findall(r"[a-z]+(?:'[a-z]+)?|[?.!,;:]", s)
        return [t for t in tokens if t]

    def _with_bounds(self, tokens):
        """Pad *tokens* with `order` start markers and one end marker."""
        return ["<s>"] * self.order + tokens + ["</s>"]

    def _similarity(self, query_tokens, doc_tokens):
        """IDF-weighted overlap score plus an LCS word-order bonus."""
        if not query_tokens or not doc_tokens:
            return 0.0
        common_words = set(query_tokens).intersection(set(doc_tokens))
        if not common_words:
            return 0.0
        # Base score: sum of IDF weights of shared words (0.1 for unknowns).
        idf_score = sum(self.idf_weights.get(word, 0.1) for word in common_words)
        # Bonus: words shared *in order* count again, at half weight.
        lcs = self._lcs(query_tokens, doc_tokens)
        order_bonus_factor = 0.5
        order_bonus = sum(self.idf_weights.get(word, 0.1) for word in lcs) * order_bonus_factor
        return idf_score + order_bonus

    def _find_best_match(self, user_text):
        """Return the index of the most similar stored user query, or None.

        Returns None when the query tokenizes to nothing or the best
        score falls below the 0.1 relevance threshold.
        """
        q_tokens = self._tokenize(user_text)
        if not q_tokens:
            return None

        best_score = -1.0
        best_idx = -1
        for i, user_doc in enumerate(self.user_docs):
            sim = self._similarity(q_tokens, user_doc)
            if sim > best_score:
                best_score = sim
                best_idx = i

        if best_idx == -1 or best_score < 0.1:
            return None
        return best_idx

    def _build_global_transitions(self, docs):
        """Build order-n Markov transition counts over all AI responses."""
        trans = defaultdict(Counter)
        for tokens in docs:
            seq = self._with_bounds(tokens)
            for i in range(len(seq) - self.order):
                ctx = tuple(seq[i : i + self.order])
                nxt = seq[i + self.order]
                trans[ctx][nxt] += 1
        return trans

    def _build_unigram(self, docs):
        """Aggregate raw token frequencies across all AI responses."""
        uni = Counter()
        for d in docs:
            uni.update(d)
        return uni

    def _get_best_starting_context(self, user_text):
        """Finds the best match and deterministically returns its starting context."""
        best_match_idx = self._find_best_match(user_text)

        if best_match_idx is not None:
            ai_doc = self.ai_docs[best_match_idx]
            # Seed generation with the matched reply's opening tokens
            # when it is long enough to fill a full context window.
            if len(ai_doc) >= self.order:
                return tuple(ai_doc[:self.order])

        # Fallback: generic sentence-start context.
        return tuple(["<s>"] * self.order)

    def _sample_next(self, context, temperature, top_k):
        """Sample the next token for *context* with back-off and top-k filtering.

        Shortens the context one token at a time until a known
        transition is found; falls back to the unigram distribution
        (minus boundary markers) when no context matches.
        """
        ctx = context
        while len(ctx) > 0:
            if ctx in self.global_transitions and self.global_transitions[ctx]:
                counter = self.global_transitions[ctx]
                break
            ctx = ctx[1:]
        else:
            # while/else: runs only if no transition context was found.
            counter = Counter({k: v for k, v in self.unigram.items() if k not in ["<s>", "</s>"]})

        if not counter: return "</s>"
        # Keep only the top_k most frequent candidates.
        items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[:top_k]
        if not items: return "</s>"
        # temperature <= 0 means greedy: always the most frequent token.
        if temperature <= 0: return items[0][0]

        tokens, weights = zip(*items)
        # Lower temperature sharpens the distribution (weights ^ (1/T)).
        scaled_weights = [w ** (1.0 / temperature) for w in weights]
        return self.rng.choices(tokens, weights=scaled_weights, k=1)[0]

    def _detokenize(self, tokens):
        """Join tokens into display text with punctuation and capitalization fixes."""
        if not tokens: return ""
        text = " ".join(t for t in tokens if t not in ["<s>", "</s>"])
        # Attach punctuation to the preceding word.
        text = re.sub(r'\s+([?.!,;:])', r'\1', text)
        text = re.sub(r" ([']) ", r"\1", text)
        # Capitalize the first character and each sentence start.
        if text: text = text[0].upper() + text[1:]
        text = re.sub(r'([.!?]\s*)([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)
        # Standalone "i" -> "I".
        text = re.sub(r'\bi\b', 'I', text)
        return text

    def respond(self, user_text, max_tokens=25, temperature=0.7, top_k=8, use_context_selection=True):
        """Generate a reply to *user_text*.

        Args:
            user_text: The user's message.
            max_tokens: Maximum number of tokens to sample beyond the seed context.
            temperature: Sampling temperature; <= 0 is greedy.
            top_k: Number of candidate tokens kept at each step.
            use_context_selection: When True, seed from the best-matching
                corpus reply; otherwise start from a blank context.

        Returns:
            The detokenized reply string.
        """
        ctx = self._get_best_starting_context(user_text) if use_context_selection else tuple(["<s>"] * self.order)
        out = list(ctx)
        for _ in range(max_tokens):
            nxt = self._sample_next(ctx, temperature, top_k)
            if nxt == "</s>": break
            out.append(nxt)
            # Context is always the last `order` tokens generated so far.
            ctx = tuple(out[-self.order:])
        return self._detokenize(out)

    def ask(self, prompt, text_world_model=False, **kwargs):
        """User-friendly wrapper for the respond method.

        When *text_world_model* is True, prepends generated world-model
        text wrapped in <world_model> tags to the reply. Remaining
        keyword arguments are forwarded to respond().
        """
        response = self.respond(prompt, **kwargs)

        if text_world_model:
            wm_response = world_model(length=10)
            wm_response = "<world_model>" + wm_response + "</world_model>"
            response = wm_response + " " + response

        return response

    def get_debug_info(self, user_text):
        """Print tokenization and best-match diagnostics for *user_text*."""
        q_tokens = self._tokenize(user_text)
        print(f"--- Debug info for: '{user_text}' ---")
        print(f"Query Tokens (after normalization): {q_tokens}\n")

        best_match_idx = self._find_best_match(user_text)

        if best_match_idx is not None:
            best_score = self._similarity(q_tokens, self.user_docs[best_match_idx])
            print("Determined Best Match:")
            print(f"  - Corpus Entry: {' '.join(self.user_docs[best_match_idx])}")
            print(f"  - Score: {best_score:.2f}")
            print(f"  - Corresponding AI response will be used for context.")
        else:
            print("No suitable match found. Will use default starting context.")
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
if __name__ == "__main__":
    print(f"Initializing model: {ModelName}")
    # Fixed seed so the demo output is reproducible across runs.
    bot = AgGPT14(corpus, order=3, seed=42)

    print("\n=== Demonstrating the Fix for 'color' query ===")
    bot.get_debug_info("what is your favorite color?")

    print("\n=== Testing Model with Deterministic Matching ===")
    tests = [
        "hi",
        "tell me a joke",
        "do you have hobbies?",
        "what is your favorite color?",
        "thanks a lot",
    ]
    for t in tests:
        print(f"user: {t}")
        response = bot.ask(t)
        print(f"ai: {response}")
        print("-" * 40)

    print("====WORLD MODEL====")
    print(world_model())

    # Final demo: reply with world-model text prepended.
    prompt = "hello, how are you?"
    print(f"\nPrompt: {prompt}")
    response = bot.ask(prompt, max_tokens=20, temperature=0.5, top_k=5, text_world_model=True)
    print(f"Response: {response}")
|
banner.png
ADDED
|
Git LFS Details
|
chat.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from aggpt14 import AgGPT14
from training_data import corpus

if __name__ == "__main__":
    # order=4 gives a longer matched context than the demos (order=3);
    # seed=None makes each session's sampling different.
    model = AgGPT14(corpus, order=4, seed=None)

    print("Chat with AgGPT14 (type 'quit' to exit)")
    print("-" * 40)

    # Simple REPL: read a prompt, generate a reply, repeat until quit.
    while True:
        prompt = input("You: ")

        if prompt.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break

        # max_tokens is effectively unbounded so replies end only when
        # the model samples its end-of-sequence marker.
        response = model.ask(prompt, max_tokens=999999, temperature=0.5, top_k=5, text_world_model=False)
        print(f"AI: {response}\n")
|
main.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from aggpt14 import AgGPT14
from training_data import corpus

if __name__ == "__main__":
    # Unseeded model: output varies from run to run.
    model = AgGPT14(corpus, order=3, seed=None)

    # Basic query using the default generation settings.
    prompt = "What is your favorite color?"
    print(f"User: {prompt}")
    response = model.ask(prompt)
    print(f"AI: {response}")


    # Second query with explicit sampling parameters;
    # text_world_model=True prepends generated <world_model> text.
    prompt = "hello, how are you?"
    print(f"\nPrompt: {prompt}")
    response = model.ask(prompt, max_tokens=20, temperature=0.5, top_k=5, text_world_model=True)
    print(f"Response: {response}")
|
training_data.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Load the chat corpus once at import time; importers use `training_data.corpus`.
# NOTE(review): training_data/corpus.txt is stored via Git LFS — confirm it is
# pulled (not a pointer stub) before this module is imported.
with open("training_data/corpus.txt", "r", encoding="utf-8") as file:
    corpus = file.read()
|
training_data/WM.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_data/corpus.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20c3209640ce4cf4efffa0ece52852e606e40c32892c1a65fe8ed46934b109b8
|
| 3 |
+
size 49492881
|