AGofficial commited on
Commit
cb29702
·
verified ·
1 Parent(s): 381bafb

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +2 -0
  2. AgGPT16.feather +3 -0
  3. AgGPT16.py +200 -0
  4. AgGPT_Feather.py +71 -0
  5. README.md +49 -3
  6. banner.png +3 -0
  7. corpus.py +0 -0
  8. test_ai.py +40 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ AgGPT16.feather filter=lfs diff=lfs merge=lfs -text
37
+ banner.png filter=lfs diff=lfs merge=lfs -text
AgGPT16.feather ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3332d4caa675d1441a3174a1b3a531d52afb2e99954711b7de654761db403028
3
+ size 2043154
AgGPT16.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import random
3
+ import re
4
+ import os
5
+ import pandas as pd
6
+ from collections import defaultdict, Counter
7
+ from AgGPT_Feather import save_model, load_model
8
+
9
class AgGPT16:
    """Word-level n-gram text generator with back-off to shorter contexts.

    Text is tokenized into lowercase words, punctuation marks and
    ``<|...|>`` special tokens. Each distinct token gets an integer id, and
    for every order ``n`` from 2 to ``max_n`` the model counts which id
    follows each ``(n - 1)``-id prefix. Generation samples the next id from
    the longest matching prefix and falls back to shorter prefixes, then to
    plain unigram frequencies.
    """

    # Id given to tokens that are not in the vocabulary. Vocabulary ids start
    # at 0, so a negative value can never collide with a real token.
    UNKNOWN_ID = -1

    # Number of most recent ids kept as prediction context.
    CONTEXT_WINDOW = 20

    # Hard upper bound on generated tokens, whatever ``output_length`` says.
    MAX_OUTPUT_TOKENS = 80

    def __init__(self, model_file='AgGPT16.feather', max_n=5, output_length=50):
        """Load the model from ``model_file``, or train it if the file is missing.

        Args:
            model_file: Path of the feather file the model is loaded from or
                saved to.
            max_n: Largest n-gram order to build and to query.
            output_length: Maximum number of tokens per response (further
                capped at ``MAX_OUTPUT_TOKENS``).
        """
        self.model_name = 'AgGPT16'
        self.model_file = model_file
        self.max_n = max_n
        self.output_length = output_length
        self.vocabulary = set()
        self.word_to_id = {}
        self.id_to_word = {}
        self.vocab_size = 0
        self.models = self._load_or_train()

    def _build_vocab_mapping(self):
        """Assign ids to the vocabulary in sorted order; no-op when it is empty."""
        if not self.vocabulary:
            return
        vocab_list = sorted(self.vocabulary)
        self.word_to_id = {word: i for i, word in enumerate(vocab_list)}
        self.id_to_word = dict(enumerate(vocab_list))
        self.vocab_size = len(vocab_list)

    def _words_to_ids(self, words):
        """Map tokens to ids; out-of-vocabulary tokens become ``UNKNOWN_ID``."""
        return [self.word_to_id.get(word, self.UNKNOWN_ID) for word in words]

    def _ids_to_words(self, ids):
        """Map ids back to tokens; ids without a token become ``'<UNK>'``."""
        return [self.id_to_word.get(token_id, '<UNK>') for token_id in ids]

    @staticmethod
    def _tokenize(text):
        """Split lowercased ``text`` into special tokens, words and punctuation."""
        tokens = re.findall(r"<\|[\w\s]*\|>|\w+|[^\w\s]", text.lower())
        return [token.strip() for token in tokens if token.strip()]

    def _build_models(self, corpus_text):
        """Build the vocabulary and the n-gram count tables from ``corpus_text``.

        Returns:
            A mapping where key ``1`` holds a ``Counter`` of id frequencies
            and each key ``n >= 2`` holds ``{prefix_tuple: Counter(next_id)}``.
        """
        print("Tokenizing...")
        words = self._tokenize(corpus_text)
        self.vocabulary = set(words)
        self._build_vocab_mapping()

        word_ids = self._words_to_ids(words)
        models = defaultdict(lambda: defaultdict(Counter))
        models[1] = Counter(word_ids)

        print("Building n-grams...")
        for n in range(2, self.max_n + 1):
            table = models[n]
            for start in range(len(word_ids) - n + 1):
                prefix = tuple(word_ids[start: start + n - 1])
                table[prefix][word_ids[start + n - 1]] += 1

        return models

    @staticmethod
    def _weighted_choice(counts):
        """Sample one key of ``counts`` proportionally to its count.

        Returns ``None`` when ``counts`` is empty or its total is not positive.
        """
        if not counts:
            return None
        ids = list(counts.keys())
        weights = list(counts.values())
        if sum(weights) <= 0:
            return None
        return random.choices(ids, weights=weights, k=1)[0]

    def _predict_next_id(self, id_sequence):
        """Sample the id that follows ``id_sequence``.

        The longest usable prefix is tried first; when it was never seen the
        next shorter one is tried, and finally the unigram distribution.

        Returns:
            The sampled id, or ``None`` when ``id_sequence`` is empty or no
            table can produce a candidate. ``None`` is used instead of ``0``
            because ``0`` is a valid vocabulary id.
        """
        if not id_sequence:
            return None

        longest = min(self.max_n, len(id_sequence) + 1)
        for n in range(longest, 1, -1):
            table = self.models.get(n)
            if not table:
                continue
            prefix = tuple(id_sequence[-(n - 1):])
            choice = self._weighted_choice(table.get(prefix))
            if choice is not None:
                return choice

        return self._weighted_choice(self.models.get(1))

    def train(self, corpus_text):
        """Train on ``corpus_text`` and save the result to ``self.model_file``."""
        print(f'Training {self.model_name}...')
        # Collapse every whitespace run (including newlines) to one space.
        cleaned_corpus = re.sub(r'[\r\n\s]+', ' ', corpus_text.strip())
        self.models = self._build_models(cleaned_corpus)
        save_model(self.models, self.model_file, self.word_to_id, self.id_to_word)
        print(f'Training complete. Vocabulary: {self.vocab_size} words')

    def _load_or_train(self):
        """Return the n-gram tables, loading them if possible, else training.

        When ``load_model`` returns only the tables (no vocabulary), the
        vocabulary attributes are left empty.
        """
        if not os.path.exists(self.model_file):
            # Imported lazily so the corpus is only loaded when training.
            from corpus import corpus
            self.train(corpus)
            return self.models

        result = load_model(self.model_file)
        if isinstance(result, tuple) and len(result) == 3:
            models, word_to_id, id_to_word = result
            self.word_to_id = word_to_id
            self.id_to_word = id_to_word
            self.vocabulary = set(word_to_id.keys())
            self.vocab_size = len(self.vocabulary)
            return models
        return result

    def generate_response(self, input_text):
        """Generate a continuation of ``input_text``.

        Generation stops at the token limit, when the same token was produced
        three times in a row, at ``<|endoftext|>``, or at ``.``, ``!`` or
        ``?`` once more than eleven tokens have been produced.

        Returns:
            The generated text with its first letter capitalized, or a fixed
            fallback sentence when the input has no tokens or nothing could
            be generated.
        """
        # _tokenize lowercases the text itself.
        tokens = self._tokenize(input_text)
        if not tokens:
            return "Please say something."

        # Slicing copies, so extending the context never mutates another list.
        context = self._words_to_ids(tokens)[-self.CONTEXT_WINDOW:]

        # Loop-invariant lookups.
        end_token_id = self.word_to_id.get('<|endoftext|>')
        sentence_end_ids = {
            self.word_to_id[mark] for mark in ('.', '!', '?')
            if mark in self.word_to_id
        }

        generated_ids = []
        for step in range(min(self.output_length, self.MAX_OUTPUT_TOKENS)):
            next_id = self._predict_next_id(context)
            if next_id is None:
                break

            generated_ids.append(next_id)
            context = (context + [next_id])[-self.CONTEXT_WINDOW:]

            # Same token three times in a row: the model is stuck.
            if len(generated_ids) >= 3 and len(set(generated_ids[-3:])) == 1:
                break

            if end_token_id is not None and next_id == end_token_id:
                break

            # Only accept a sentence end once the answer has some length.
            if step > 10 and next_id in sentence_end_ids:
                break

        if not generated_ids:
            return "I'm not sure how to respond."

        response = ' '.join(self._ids_to_words(generated_ids))
        response = re.sub(r'\s+', ' ', response)
        # Re-attach punctuation to the preceding word.
        response = re.sub(r'\s+([,.!?;:])', r'\1', response)
        response = re.sub(r'<\|endoftext\|>', '', response)

        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        return response.strip()
163
+
164
def ask(prompt: str) -> str:
    """Answer ``prompt`` using a lazily created, shared ``AgGPT16`` model.

    The prompt is wrapped as ``"user: <prompt> ai: "`` before generation.
    Anything from ``<|endoftext|>`` onwards and one leading ``ai:`` / ``user:``
    label are removed from the generated text. A blank prompt gets a fixed
    reply, and a result shorter than two characters is replaced by a randomly
    chosen fallback sentence.
    """
    question = prompt.strip()
    if not question:
        return "Please ask me something!"

    # The model is cached on the function object, so it is built only once.
    try:
        model = ask.model
    except AttributeError:
        model = ask.model = AgGPT16()

    raw_reply = model.generate_response("user: " + question + " ai: ")

    # Keep only the text in front of the first end-of-text marker.
    reply = raw_reply.split('<|endoftext|>')[0]
    reply = re.sub(r'^\s*(ai|user)\s*:\s*', '', reply, flags=re.IGNORECASE).strip()

    if len(reply) >= 2:
        return reply

    return random.choice([
        "Could you rephrase that?",
        "Tell me more.",
        "I'm not sure I understand.",
        "Let me think about that."
    ])
192
+
193
if __name__ == "__main__":
    # Interactive chat loop: read a line, print the reply, repeat.
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
            print("\nGoodbye!")
            break
        # Strip so that " exit " is recognised as well.
        if user_input.strip().lower() in {'exit', 'quit'}:
            print("Goodbye!")
            break
        reply = ask(user_input)
        print(f"AI: {reply}")
AgGPT_Feather.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from collections import defaultdict, Counter
3
+
4
def save_model(models, model_file, word_to_id, id_to_word):
    """Write the n-gram tables and the vocabulary to one feather file.

    Every row carries a ``data_type`` column: ``'model'`` rows hold
    ``n`` / ``prefix`` / ``suffix`` / ``count``, and ``'vocab'`` rows hold
    ``word`` / ``id``. Unigram rows use the placeholder prefix
    ``'_UNIGRAM_'``; longer prefixes are stored as space-separated ids.

    Args:
        models: Mapping of n-gram order to its count table.
        model_file: Destination path.
        word_to_id: Token-to-id mapping to store.
        id_to_word: Accepted but not used by this function.
    """
    print(f"Saving model to {model_file}...")

    vocab_rows = [
        {'word': token, 'id': token_id}
        for token, token_id in word_to_id.items()
    ]

    model_rows = []
    if 1 in models:
        model_rows.extend(
            {'n': 1, 'prefix': '_UNIGRAM_', 'suffix': token_id, 'count': freq}
            for token_id, freq in models[1].items()
        )

    for order, table in models.items():
        if not order > 1:
            continue
        for prefix_ids, followers in table.items():
            joined_prefix = ' '.join(str(part) for part in prefix_ids)
            model_rows.extend(
                {'n': order, 'prefix': joined_prefix, 'suffix': next_id, 'count': freq}
                for next_id, freq in followers.items()
            )

    model_frame = pd.DataFrame(model_rows).assign(data_type='model')
    vocab_frame = pd.DataFrame(vocab_rows).assign(data_type='vocab')

    pd.concat([model_frame, vocab_frame], ignore_index=True).to_feather(model_file)
    print("Model saved successfully.")
35
+
36
+ def load_model(model_file):
37
+ print(f"Loading model from {model_file}...")
38
+ df = pd.read_feather(model_file)
39
+
40
+ models = defaultdict(lambda: defaultdict(Counter))
41
+ word_to_id = {}
42
+ id_to_word = {}
43
+
44
+ if 'data_type' in df.columns:
45
+ vocab_df = df[df['data_type'] == 'vocab']
46
+ for _, row in vocab_df.iterrows():
47
+ word = row['word']
48
+ word_id = row['id']
49
+ word_to_id[word] = word_id
50
+ id_to_word[word_id] = word
51
+
52
+ model_df = df[df['data_type'] == 'model']
53
+ else:
54
+ model_df = df
55
+
56
+ unigram_df = model_df[model_df['n'] == 1]
57
+ for _, row in unigram_df.iterrows():
58
+ models[1][row['suffix']] = row['count']
59
+
60
+ ngram_df = model_df[model_df['n'] > 1]
61
+ for _, row in ngram_df.iterrows():
62
+ n, prefix_str, suffix, count = row['n'], row['prefix'], row['suffix'], row['count']
63
+ prefix = tuple(map(int, prefix_str.split()))
64
+ models[n][prefix][suffix] += count
65
+
66
+ print("Model loaded successfully.")
67
+
68
+ if word_to_id and id_to_word:
69
+ return models, word_to_id, id_to_word
70
+ else:
71
+ return models
README.md CHANGED
@@ -1,3 +1,49 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <img src="banner.png" alt="AgGPT Banner" width="600"/>
2
+
3
+ # AgGPT-16
4
+
5
+ A very lightweight language model that can be scaled and improved easily. Built with advanced attention mechanisms, context awareness, and quality control features to deliver coherent and contextually relevant responses.
6
+
7
+ ## Quick Start
8
+
9
+ ### Basic Usage
10
+ ```python
11
+ from AgGPT16 import ask
12
+
13
+ response = ask("Hello, how are you today?")
14
+ print(response)
15
+ ```
16
+
17
+
18
+ ## 🔧 Configuration Options
19
+
20
+ ```python
21
+ ai = AgGPT16(
22
+ model_file='custom_model.feather', # Model save location
23
+ max_n=5, # Maximum n-gram size
24
+ output_length=150 # Max response length
25
+ )
26
+ ```
27
+
28
+ ## 📊 Training Data Format
29
+
30
+ The model expects conversation data in this format:
31
+ ```
32
+ user: [user message]
33
+ ai: [ai response] <|endoftext|>
34
+ ```
35
+
36
+ ## 🚫 Limitations
37
+
38
+ - Training time scales with corpus size
39
+ - Memory usage increases with vocabulary size
40
+ - Response quality depends on training data quality
41
+ - No external knowledge beyond training corpus
42
+
43
+ ## 🤝 Contributing
44
+
45
+ This is an educational/research project. Feel free to experiment and improve upon the architecture!
46
+
47
+ ## 📝 License
48
+
49
+ Open source - feel free to use and modify.
banner.png ADDED

Git LFS Details

  • SHA256: 4dc02b72d882da9eab1b392b3d8649b5c49998880b87c1549f6038463e45956c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.95 MB
corpus.py ADDED
The diff for this file is too large to render. See raw diff
 
test_ai.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for AgGPT16
3
+ """
4
+
5
+ from AgGPT16 import ask
6
+ import time
7
+
8
def test_ai():
    """Send a fixed list of prompts to ``ask`` and print replies with timings.

    This is a manual smoke run, not an assertion-based test: an exception
    raised for one prompt is printed and the remaining prompts still run.
    """
    print("Testing AgGPT16 AI")
    print("=" * 50)

    test_prompts = [
        "Hello, how are you?",
        "What is Python?",
        "Tell me about machine learning",
        "I'm feeling sad today",
        "What's your favorite color?",
        "Can you help me with coding?",
    ]

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n{i}. USER: {prompt}")
        print("-" * 40)

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        try:
            response = ask(prompt)
        except Exception as e:
            # Deliberately broad: report the failure and keep going.
            print(f"❌ Error: {e}")
        else:
            elapsed = time.perf_counter() - start_time
            print(f"AI: {response}")
            print(f"⏱️ Response time: {elapsed:.2f}s")

    print()


if __name__ == "__main__":
    test_ai()