AGofficial committed on
Commit
8d67672
·
verified ·
1 Parent(s): ce4ad48

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. chat.py +321 -0
  3. feather.py +196 -0
  4. train.py +298 -0
  5. training_data/corpora.txt +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  banner.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  banner.png filter=lfs diff=lfs merge=lfs -text
37
+ training_data/corpora.txt filter=lfs diff=lfs merge=lfs -text
chat.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import random
4
+ from typing import List, Dict, Tuple, Any
5
+ from collections import defaultdict
6
+ import math
7
+
8
+ from feather import FeatherManager, similarity_score
9
+ from train import GrammarRules, PatternExtractor
10
+
11
+
12
class ResponseGenerator:
    """Scores loaded mini-models against user input and blends their responses.

    Each mini-model is a dict (see feather.FeatherManager) with 'patterns',
    'responses', 'keywords', 'confidence' and 'training_samples' keys. The
    generator keeps a short conversational context window that biases model
    selection toward recently relevant models.
    """

    def __init__(self, feather_manager: FeatherManager):
        self.feather_manager = feather_manager
        self.pattern_extractor = PatternExtractor()
        self.grammar_rules = GrammarRules()
        self.models = []             # mini-model dicts loaded from .feather files
        self.context_window = []     # recent (user_input, response) pairs
        self.max_context_length = 10

    def load_models(self):
        """Load every saved mini-model; return False when none are found."""
        print("Loading mini-models...")
        self.models = self.feather_manager.load_all_models()
        print(f"Loaded {len(self.models)} mini-models")

        if not self.models:
            print("No trained models found! Please run train.py first.")
            return False

        return True

    def calculate_model_scores(self, user_input: str) -> List[Tuple[Dict[str, Any], float]]:
        """Score every model against *user_input*; highest score first.

        The score sums per-pattern similarity and doubled keyword Jaccard
        overlap, is scaled by the model's stored confidence, then gains a
        small training-volume bonus (capped at 0.2) and a context bonus.
        """
        if not self.models:
            return []

        input_pattern = self.pattern_extractor.create_pattern(user_input)
        input_keywords = set(self.pattern_extractor.extract_keywords(user_input))

        model_scores = []

        for model in self.models:
            score = 0.0

            # Accumulate similarity of the input against every stored pattern.
            for pattern in model.get('patterns', []):
                pattern_sim = self.pattern_extractor.calculate_pattern_similarity(input_pattern, pattern)
                score += pattern_sim

            # Keyword Jaccard overlap counts double relative to pattern similarity.
            model_keywords = set(model.get('keywords', []))
            if model_keywords and input_keywords:
                keyword_overlap = len(input_keywords.intersection(model_keywords))
                keyword_total = len(input_keywords.union(model_keywords))
                keyword_score = keyword_overlap / keyword_total if keyword_total > 0 else 0
                score += keyword_score * 2

            # Confidence scales the match score multiplicatively.
            confidence = model.get('confidence', 0.5)
            score *= confidence

            # Small additive bonus for well-trained models, capped at 0.2.
            training_samples = model.get('training_samples', 1)
            training_bonus = min(0.2, training_samples / 100)
            score += training_bonus

            context_bonus = self._calculate_context_bonus(user_input, model)
            score += context_bonus

            model_scores.append((model, score))

        model_scores.sort(key=lambda x: x[1], reverse=True)

        return model_scores

    def _calculate_context_bonus(self, user_input: str, model: Dict[str, Any]) -> float:
        """Bonus (capped at 0.3) when the model's patterns resemble recent turns.

        Only the last 3 context turns and the model's first 5 patterns are
        compared, keeping the cost bounded.
        """
        if not self.context_window:
            return 0.0

        context_bonus = 0.0

        for prev_input, prev_response in self.context_window[-3:]:
            for pattern in model.get('patterns', [])[:5]:
                pattern_sim = similarity_score(prev_input, pattern.strip())
                context_bonus += pattern_sim * 0.1

        return min(context_bonus, 0.3)

    def select_top_models(self, model_scores: List[Tuple[Dict[str, Any], float]], top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """Keep the best *top_k* models scoring above 0.01.

        When nothing clears the threshold, fall back to up to 3 randomly
        sampled models so the chat can still answer something.
        """
        valid_models = [(model, score) for model, score in model_scores if score > 0.01]

        if not valid_models:
            valid_models = random.sample(model_scores, min(3, len(model_scores)))

        return valid_models[:top_k]

    def generate_responses_from_models(self, user_input: str, top_models: List[Tuple[Dict[str, Any], float]]) -> List[Tuple[str, float]]:
        """Collect up to 3 weighted candidate responses from each selected model.

        Each candidate's weight mixes the model score with its pattern
        similarity (70% model score, 30% similarity-scaled).
        """
        responses = []
        input_pattern = self.pattern_extractor.create_pattern(user_input)

        for model, model_score in top_models:
            patterns = model.get('patterns', [])
            model_responses_list = model.get('responses', [])

            if not patterns or not model_responses_list:
                continue

            # Pair each pattern with its same-index response; keep matches above 0.1.
            best_matches = []
            for i, pattern in enumerate(patterns):
                if i < len(model_responses_list):
                    sim = self.pattern_extractor.calculate_pattern_similarity(input_pattern, pattern)
                    if sim > 0.1:
                        best_matches.append((model_responses_list[i], sim))

            best_matches.sort(key=lambda x: x[1], reverse=True)

            # No decent match: fall back to one random response at low similarity.
            selected_responses = best_matches[:3] if best_matches else [(random.choice(model_responses_list), 0.1)]

            for response, pattern_sim in selected_responses:
                weight = model_score * (0.7 + pattern_sim * 0.3)
                responses.append((response, weight))

        return responses

    def combine_responses(self, responses: List[Tuple[str, float]]) -> str:
        """Pick one response by weighted random choice among deduplicated candidates.

        Candidates sharing the same first three words are deduplicated to the
        heaviest variant. The returned string always ends with '<eos>'.
        """
        if not responses:
            return "I'm not sure how to respond to that."

        filtered_responses = [(resp, weight) for resp, weight in responses if weight > 0.05]
        if not filtered_responses:
            filtered_responses = responses[:1]

        # Group near-duplicates by their first three (lowercased) words.
        response_groups = defaultdict(list)
        for response, weight in filtered_responses:
            key = ' '.join(response.split()[:3]).lower()
            response_groups[key].append((response, weight))

        best_responses = []
        for group in response_groups.values():
            best_resp, best_weight = max(group, key=lambda x: x[1])
            best_responses.append((best_resp, best_weight))

        if len(best_responses) > 1:
            # Roulette-wheel selection over the normalized weights.
            total_weight = sum(weight for _, weight in best_responses)
            if total_weight > 0:
                normalized_weights = [weight / total_weight for _, weight in best_responses]

                rand_val = random.random()
                cumsum = 0.0
                for i, norm_weight in enumerate(normalized_weights):
                    cumsum += norm_weight
                    if rand_val <= cumsum:
                        selected_response = best_responses[i][0]
                        break
                else:
                    selected_response = best_responses[0][0]
            else:
                selected_response = best_responses[0][0]
        else:
            selected_response = best_responses[0][0]

        final_response = selected_response

        if not final_response.endswith('<eos>'):
            final_response += ' <eos>'

        return final_response

    def generate_response(self, user_input: str) -> str:
        """Full pipeline: score models, pick candidates, combine, update context."""
        if not user_input.strip():
            return "Please say something! <eos>"

        model_scores = self.calculate_model_scores(user_input)

        if not model_scores:
            return "I need to learn more before I can respond properly. <eos>"

        top_models = self.select_top_models(model_scores, top_k=5)

        responses = self.generate_responses_from_models(user_input, top_models)

        final_response = self.combine_responses(responses)

        # Remember this exchange; evict the oldest turn past the window cap.
        self.context_window.append((user_input, final_response))
        if len(self.context_window) > self.max_context_length:
            self.context_window.pop(0)

        return final_response

    def get_model_statistics(self) -> Dict[str, Any]:
        """Aggregate counts and average confidence across all loaded models."""
        if not self.models:
            return {"total_models": 0}

        total_patterns = sum(len(model.get('patterns', [])) for model in self.models)
        total_responses = sum(len(model.get('responses', [])) for model in self.models)
        avg_confidence = sum(model.get('confidence', 0) for model in self.models) / len(self.models)
        total_training_samples = sum(model.get('training_samples', 0) for model in self.models)

        return {
            "total_models": len(self.models),
            "total_patterns": total_patterns,
            "total_responses": total_responses,
            "average_confidence": avg_confidence,
            "total_training_samples": total_training_samples
        }
210
+
211
+
212
class AgGPTChat:
    """Interactive console front-end over ResponseGenerator.

    Owns the FeatherManager / ResponseGenerator pair and keeps a transcript
    of the conversation in ``conversation_history``.
    """

    def __init__(self, models_dir: str = "models"):
        self.feather_manager = FeatherManager(models_dir)
        self.response_generator = ResponseGenerator(self.feather_manager)
        self.conversation_history = []  # list of {'user': ..., 'assistant': ...} dicts

    def initialize(self) -> bool:
        """Load the mini-models and print a startup banner.

        Returns False when no trained models are available.
        """
        print("AgGPT-17 Scalable Feather Architecture Chat")
        print("=" * 50)

        success = self.response_generator.load_models()
        if success:
            stats = self.response_generator.get_model_statistics()
            print(f"Model Statistics:")
            print(f" Mini-models loaded: {stats['total_models']}")
            print(f" Total patterns: {stats['total_patterns']}")
            print(f" Total responses: {stats['total_responses']}")
            print(f" Average confidence: {stats['average_confidence']:.3f}")
            print(f" Training samples: {stats['total_training_samples']}")
            print("=" * 50)
            print("Chat initialized! Type 'quit' to exit.")
            print("Large context window active - I'll remember our conversation!")
            print()

        return success

    def chat_loop(self):
        """Run the interactive REPL until the user quits or interrupts.

        Special commands (case-insensitive): quit/exit/bye/goodbye end the
        session, stats/statistics print model statistics, clear/reset empty
        the generator's context window.
        """
        if not self.initialize():
            return

        while True:
            try:
                user_input = input("You: ").strip()

                if not user_input:
                    continue

                if user_input.lower() in ['quit', 'exit', 'bye', 'goodbye']:
                    print("AgGPT: Goodbye! Thanks for chatting with me! <eos>")
                    break

                if user_input.lower() in ['stats', 'statistics']:
                    stats = self.response_generator.get_model_statistics()
                    print("Current Statistics:")
                    for key, value in stats.items():
                        print(f" {key}: {value}")
                    continue

                if user_input.lower() in ['clear', 'reset']:
                    self.response_generator.context_window = []
                    print("Context cleared!")
                    continue

                print("AgGPT: ", end="", flush=True)
                response = self.response_generator.generate_response(user_input)

                # Strip the internal <eos> marker before showing the reply.
                display_response = response.replace(' <eos>', '').replace('<eos>', '')
                print(display_response)
                print()

                self.conversation_history.append({
                    'user': user_input,
                    'assistant': display_response
                })

            except KeyboardInterrupt:
                print("\n\nAgGPT: Chat interrupted. Goodbye!")
                break
            except Exception as e:
                # Best-effort loop: report the error and keep the session alive.
                print(f"\nError: {e}")
                print("Let me try again...")
                continue

    def batch_test(self, test_inputs: List[str]):
        """Generate and print a response for each prompt in *test_inputs*."""
        if not self.initialize():
            return

        print("Running batch test...")
        print("=" * 50)

        for i, test_input in enumerate(test_inputs, 1):
            print(f"Test {i}: {test_input}")
            response = self.response_generator.generate_response(test_input)
            display_response = response.replace(' <eos>', '').replace('<eos>', '')
            print(f"Response: {display_response}")
            print("-" * 30)
299
+
300
+
301
def main():
    """Entry point: ``python chat.py test`` runs a batch smoke test, otherwise
    start the interactive chat loop."""
    import sys

    chat = AgGPTChat()

    run_batch = len(sys.argv) > 1 and sys.argv[1] == "test"
    if not run_batch:
        chat.chat_loop()
        return

    sample_prompts = [
        "hi",
        "hello there",
        "how are you?",
        "what's your favorite color?",
        "tell me a joke",
        "thank you",
        "goodbye",
    ]
    chat.batch_test(sample_prompts)


if __name__ == "__main__":
    main()
feather.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from typing import Dict, List, Any, Optional
4
+ import math
5
+
6
+
7
class FeatherManager:
    """Persists mini-models as Feather files named ``AgGPT_Expert_<id>.feather``.

    Each mini-model dict is flattened into a pandas DataFrame whose scalar
    fields (confidence, keywords, training_samples) are repeated per row.
    """

    def __init__(self, models_dir: str = "models"):
        self.models_dir = models_dir
        os.makedirs(models_dir, exist_ok=True)

    def save_mini_model(self, model_data: Dict[str, Any], model_id: int) -> str:
        """Serialize one mini-model dict to disk and return the file path.

        Empty pattern/response lists are replaced with a trivial greeting
        pair so a readable file is always written.
        """
        filename = f"AgGPT_Expert_{model_id:04d}.feather"
        filepath = os.path.join(self.models_dir, filename)

        patterns = model_data.get('patterns', [])
        responses = model_data.get('responses', [])

        if not patterns or not responses:
            print(f"Warning: Model {model_id} has empty patterns or responses")
            patterns = patterns or ['hello']
            responses = responses or ['Hello!']

        df_data = {
            'patterns': [str(pattern) for pattern in patterns],
            'responses': [str(response) for response in responses],
            'weights': model_data.get('weights', [1.0] * len(patterns)),
            'confidence': [model_data.get('confidence', 0.5)] * len(patterns),
            'grammar_rules': [str(rule) for rule in model_data.get('grammar_rules', [])] or ['none'],
            'keywords': [' '.join(model_data.get('keywords', []))] * len(patterns),
            'training_samples': [model_data.get('training_samples', 0)] * len(patterns)
        }

        # DataFrame columns must share one length: pad shorter lists by
        # repeating their last element.
        max_len = max(len(v) if isinstance(v, list) else 1 for v in df_data.values())

        for key, value in df_data.items():
            if isinstance(value, list):
                while len(value) < max_len:
                    value.append(value[-1] if value else '')

        df = pd.DataFrame(df_data)
        df.to_feather(filepath)

        # Bug fix: this f-string previously had no placeholder and printed a
        # literal "(unknown)" instead of the file name.
        print(f"Saved mini-model: {filename}")
        return filepath

    def load_mini_model(self, model_id: int) -> Optional[Dict[str, Any]]:
        """Read one mini-model back from disk; None when missing or unreadable."""
        filename = f"AgGPT_Expert_{model_id:04d}.feather"
        filepath = os.path.join(self.models_dir, filename)

        if not os.path.exists(filepath):
            return None

        try:
            df = pd.read_feather(filepath)

            # NOTE(review): patterns and responses are filtered for emptiness
            # independently, which could desynchronize their indices if only
            # one side contains an empty string.
            model_data = {
                'patterns': [p for p in df['patterns'].tolist() if p],
                'responses': [r for r in df['responses'].tolist() if r],
                'weights': df['weights'].tolist(),
                'confidence': df['confidence'].iloc[0] if len(df) > 0 else 0.5,
                'grammar_rules': [rule for rule in df['grammar_rules'].tolist() if rule],
                'keywords': df['keywords'].iloc[0].split() if len(df) > 0 and df['keywords'].iloc[0] else [],
                'training_samples': df['training_samples'].iloc[0] if len(df) > 0 else 0,
                'model_id': model_id
            }

            return model_data

        except Exception as e:
            print(f"Error loading model {model_id}: {e}")
            return None

    def load_all_models(self) -> List[Dict[str, Any]]:
        """Load every ``AgGPT_Expert_*.feather`` model in the models directory."""
        models = []

        if not os.path.exists(self.models_dir):
            return models

        for filename in os.listdir(self.models_dir):
            if filename.startswith("AgGPT_Expert_") and filename.endswith(".feather"):
                try:
                    model_id = int(filename.split("_")[2].split(".")[0])
                    model = self.load_mini_model(model_id)
                    if model:
                        models.append(model)
                except (ValueError, IndexError):
                    # Bug fix: include the offending file name (was "(unknown)").
                    print(f"Warning: Invalid model filename format: {filename}")
                    continue

        return models

    def get_model_count(self) -> int:
        """Count the model files currently on disk."""
        if not os.path.exists(self.models_dir):
            return 0

        count = 0
        for filename in os.listdir(self.models_dir):
            if filename.startswith("AgGPT_Expert_") and filename.endswith(".feather"):
                count += 1

        return count

    def get_next_model_id(self) -> int:
        """Return one past the highest model id found on disk (1 when empty)."""
        if not os.path.exists(self.models_dir):
            return 1

        max_id = 0
        for filename in os.listdir(self.models_dir):
            if filename.startswith("AgGPT_Expert_") and filename.endswith(".feather"):
                try:
                    model_id = int(filename.split("_")[2].split(".")[0])
                    max_id = max(max_id, model_id)
                except (ValueError, IndexError):
                    continue

        return max_id + 1

    def delete_model(self, model_id: int) -> bool:
        """Delete one model file; True on success, False when missing or failed."""
        filename = f"AgGPT_Expert_{model_id:04d}.feather"
        filepath = os.path.join(self.models_dir, filename)

        if os.path.exists(filepath):
            try:
                os.remove(filepath)
                # Bug fix: report the deleted file name (was "(unknown)").
                print(f"Deleted model: {filename}")
                return True
            except Exception as e:
                print(f"Error deleting model {model_id}: {e}")
                return False

        return False

    def clear_all_models(self) -> int:
        """Delete every model file and return how many were removed."""
        if not os.path.exists(self.models_dir):
            return 0

        deleted_count = 0
        for filename in os.listdir(self.models_dir):
            if filename.startswith("AgGPT_Expert_") and filename.endswith(".feather"):
                try:
                    os.remove(os.path.join(self.models_dir, filename))
                    deleted_count += 1
                except Exception as e:
                    # Bug fix: name the file that failed (was "(unknown)").
                    print(f"Error deleting {filename}: {e}")

        print(f"Deleted {deleted_count} model files")
        return deleted_count
150
+
151
+
152
def similarity_score(text1: str, text2: str) -> float:
    """Jaccard similarity between the lowercase word sets of two strings.

    Returns 0.0 when either input is empty or contains no words.
    """
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    if not vocab_a or not vocab_b:
        return 0.0

    shared = vocab_a & vocab_b
    combined = vocab_a | vocab_b

    return len(shared) / len(combined) if combined else 0.0
166
+
167
+
168
def calculate_confidence_score(patterns: List[str], responses: List[str]) -> float:
    """Heuristic confidence in [0.1, 0.9] that grows with pattern count.

    Empty or length-mismatched pattern/response lists get the floor 0.1;
    otherwise confidence is len(patterns)/10 clamped into the range.
    """
    mismatched = (
        not patterns
        or not responses
        or len(patterns) != len(responses)
    )
    if mismatched:
        return 0.1

    scaled = len(patterns) / 10.0
    clamped = min(0.9, scaled)
    return max(0.1, min(1.0, clamped))
175
+
176
+
177
+ if __name__ == "__main__":
178
+ manager = FeatherManager()
179
+
180
+ test_model = {
181
+ 'patterns': ['hello', 'hi', 'hey'],
182
+ 'responses': ['Hello! How can I help you?', 'Hi there!', 'Hey! What\'s up?'],
183
+ 'weights': [1.0, 0.9, 0.8],
184
+ 'confidence': 0.8,
185
+ 'grammar_rules': ['capitalize_first_word', 'end_with_punctuation'],
186
+ 'keywords': ['greeting', 'hello', 'hi'],
187
+ 'training_samples': 150
188
+ }
189
+
190
+ model_id = manager.get_next_model_id()
191
+ manager.save_mini_model(test_model, model_id)
192
+ loaded_model = manager.load_mini_model(model_id)
193
+
194
+ print(f"Original model: {test_model}")
195
+ print(f"Loaded model: {loaded_model}")
196
+ print(f"Models count: {manager.get_model_count()}")
train.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import math
import os
import random
import re
from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple

import pandas as pd
from tqdm import tqdm

from feather import FeatherManager, similarity_score, calculate_confidence_score
11
+
12
class GrammarRules:
    """Placeholder grammar post-processor for generated text."""

    @staticmethod
    def apply_all_rules(text: str) -> str:
        """Strip surrounding whitespace; falsy input is passed through unchanged."""
        return text.strip() if text else text
20
+
21
class PatternExtractor:
    """Turns raw user text into normalized patterns and keyword lists."""

    def __init__(self):
        # Common English function words excluded from keyword extraction.
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'can',
            'shall',
        }

    def extract_keywords(self, text: str) -> List[str]:
        """Return the unique alphabetic tokens longer than 2 chars, minus stop words."""
        if not text:
            return []

        tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        unique = {tok for tok in tokens if tok not in self.stop_words and len(tok) > 2}
        return list(unique)

    def create_pattern(self, user_input: str) -> str:
        """Lowercase, collapse internal whitespace, and pad with single spaces."""
        if not user_input:
            return ""

        collapsed = re.sub(r'\s+', ' ', user_input.strip().lower())
        return f" {collapsed} "

    def calculate_pattern_similarity(self, pattern1: str, pattern2: str) -> float:
        """Word-set similarity of the two stripped patterns (see similarity_score)."""
        return similarity_score(pattern1.strip(), pattern2.strip())
48
+
49
+
50
class MiniModelTrainer:
    """Builds individual mini-models from chunks of (user, ai) training pairs."""

    def __init__(self, feather_manager: FeatherManager):
        self.feather_manager = feather_manager
        self.pattern_extractor = PatternExtractor()
        self.grammar_rules = GrammarRules()

    def train_mini_model(self, training_pairs: List[Tuple[str, str]], confidence_threshold: float = 0.1) -> Optional[Dict[str, Any]]:
        """Turn *training_pairs* into one mini-model dict.

        Returns None when fewer than two pairs are supplied (the annotation
        previously claimed a plain Dict despite this). ``confidence_threshold``
        is currently unused and kept only for interface compatibility.
        """
        if not training_pairs or len(training_pairs) < 2:
            return None

        patterns = []
        responses = []
        weights = []
        all_keywords = []

        for user_input, ai_response in training_pairs:
            processed_response = ai_response.strip()

            pattern = self.pattern_extractor.create_pattern(user_input)

            keywords = self.pattern_extractor.extract_keywords(user_input)
            all_keywords.extend(keywords)

            patterns.append(pattern)
            responses.append(processed_response)
            weights.append(1.0)

        # Confidence grows with sample count, saturating at 0.9 (20+ pairs).
        confidence = min(0.9, len(training_pairs) / 20.0)

        # Keep the 10 most frequent keywords as the model's topic signature.
        keyword_counter = Counter(all_keywords)
        top_keywords = [word for word, count in keyword_counter.most_common(10)]

        mini_model = {
            'patterns': patterns,
            'responses': responses,
            'weights': weights,
            'confidence': confidence,
            'grammar_rules': [],
            'keywords': top_keywords,
            'training_samples': len(training_pairs)
        }

        return mini_model

    def should_merge_models(self, model1: Dict[str, Any], model2: Dict[str, Any], merge_threshold: float = 0.8) -> bool:
        """True when two models overlap strongly in keywords AND responses.

        Both keyword Jaccard similarity and average pairwise response
        similarity must exceed *merge_threshold*, and both models must
        have confidence above 0.7.
        """
        keywords1 = set(model1.get('keywords', []))
        keywords2 = set(model2.get('keywords', []))

        if not keywords1 or not keywords2:
            return False

        keyword_similarity = len(keywords1.intersection(keywords2)) / len(keywords1.union(keywords2))

        responses1 = model1.get('responses', [])
        responses2 = model2.get('responses', [])

        # Compare only the first 5 responses of each model to bound the cost.
        response_similarities = []
        for r1 in responses1[:5]:
            for r2 in responses2[:5]:
                sim = similarity_score(r1, r2)
                response_similarities.append(sim)

        avg_response_similarity = sum(response_similarities) / len(response_similarities) if response_similarities else 0

        min_confidence = min(model1.get('confidence', 0), model2.get('confidence', 0))

        return (keyword_similarity > merge_threshold and
                avg_response_similarity > merge_threshold and
                min_confidence > 0.7)

    def merge_mini_models(self, model1: Dict[str, Any], model2: Dict[str, Any]) -> Dict[str, Any]:
        """Concatenate two models' lists, average confidence, union keywords/rules."""
        merged_model = {
            'patterns': model1.get('patterns', []) + model2.get('patterns', []),
            'responses': model1.get('responses', []) + model2.get('responses', []),
            'weights': model1.get('weights', []) + model2.get('weights', []),
            'confidence': (model1.get('confidence', 0) + model2.get('confidence', 0)) / 2,
            'grammar_rules': list(set(model1.get('grammar_rules', []) + model2.get('grammar_rules', []))),
            'keywords': list(set(model1.get('keywords', []) + model2.get('keywords', []))),
            'training_samples': model1.get('training_samples', 0) + model2.get('training_samples', 0)
        }

        return merged_model
133
+
134
+
135
class AgGPTTrainer:
    """End-to-end training pipeline: parse corpus, chunk it, train and merge mini-models."""

    def __init__(self, models_dir: str = "models"):
        self.feather_manager = FeatherManager(models_dir)
        self.mini_trainer = MiniModelTrainer(self.feather_manager)
        self.target_size_mb = 5                 # rough target size of one training chunk
        self.estimated_size_per_pair = 1000     # assumed bytes per (user, ai) pair
        # Number of pairs per chunk derived from the two estimates above.
        self.chunk_size = (self.target_size_mb * 1024 * 1024) // self.estimated_size_per_pair

    def load_training_data(self, file_path: str) -> List[Tuple[str, str]]:
        """Parse the corpus at *file_path* into (user_input, ai_response) pairs.

        The corpus is a series of conversations separated by '<eos>', each
        containing 'user:' and 'ai:' sections; '<pad>' tokens are removed
        from both sides.
        """
        training_pairs = []

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        conversations = content.split('<eos>')

        print(f"Processing {len(conversations)} conversation chunks...")

        for conversation in tqdm(conversations, desc="Parsing conversations"):
            conversation = conversation.strip()
            if not conversation:
                continue

            # User text runs until a '<pad>' line, an 'ai:' line, or the end.
            user_match = re.search(r'user:\s*(.*?)(?=\n<pad>|\nai:|$)', conversation, re.DOTALL)
            ai_match = re.search(r'ai:\s*(.*?)$', conversation, re.DOTALL)

            if user_match and ai_match:
                user_input = user_match.group(1).strip()
                ai_response = ai_match.group(1).strip()

                user_input = re.sub(r'<pad>', '', user_input).strip()
                ai_response = re.sub(r'<pad>', '', ai_response).strip()

                if user_input and ai_response and len(user_input) > 0 and len(ai_response) > 0:
                    training_pairs.append((user_input, ai_response))

        print(f"Extracted {len(training_pairs)} training pairs")
        return training_pairs

    def create_training_chunks(self, training_pairs: List[Tuple[str, str]]) -> List[List[Tuple[str, str]]]:
        """Shuffle the pairs and split them into chunks of ``self.chunk_size``.

        Chunks with fewer than 5 pairs (typically the tail) are dropped.
        """
        shuffled_pairs = training_pairs.copy()
        random.shuffle(shuffled_pairs)

        chunks = []
        total_pairs = len(shuffled_pairs)

        for i in range(0, total_pairs, self.chunk_size):
            chunk = shuffled_pairs[i:i + self.chunk_size]
            if len(chunk) >= 5:
                chunks.append(chunk)

        print(f"Created {len(chunks)} training chunks (target: {self.target_size_mb}MB each)")
        return chunks

    def train(self, training_file: str = "training_data/corpora.txt", merge_similar: bool = True):
        """Run the full pipeline: wipe old models, parse, chunk, train, optionally merge."""
        print("Starting AgGPT-17 Training with Scalable Feather Architecture")
        print("=" * 60)

        # Training always starts from a clean slate.
        cleared_count = self.feather_manager.clear_all_models()
        if cleared_count > 0:
            print(f"Cleared {cleared_count} existing models")

        print("Loading training data...")
        training_pairs = self.load_training_data(training_file)

        if not training_pairs:
            print("No training data found!")
            return

        print("Creating training chunks...")
        training_chunks = self.create_training_chunks(training_pairs)

        print("Training mini-models...")
        trained_models = []
        model_id = 1

        progress_bar = tqdm(training_chunks, desc="Training mini-models")
        for chunk in progress_bar:
            mini_model = self.mini_trainer.train_mini_model(chunk)

            # train_mini_model returns None for chunks with fewer than 2 pairs.
            if mini_model:
                trained_models.append(mini_model)
                self.feather_manager.save_mini_model(mini_model, model_id)
                model_id += 1

                progress_bar.set_postfix({
                    'Models': len(trained_models),
                    'Confidence': f"{mini_model['confidence']:.3f}"
                })

        print(f"Trained {len(trained_models)} mini-models")

        if merge_similar and len(trained_models) > 1:
            print("Merging similar models...")
            self._merge_similar_models()

        final_count = self.feather_manager.get_model_count()
        print(f"Training complete! Final model count: {final_count}")
        print("=" * 60)

    def _merge_similar_models(self):
        """Greedily merge pairs of similar saved models, then delete the originals.

        Each model merges at most once per pass (the inner loop breaks after
        a merge). NOTE(review): deletion maps list indices back to model ids
        via the 'model_id' key with index+1 as fallback -- this assumes
        load_all_models order is stable; verify if merges ever remove the
        wrong file.
        """
        all_models = self.feather_manager.load_all_models()
        if len(all_models) < 2:
            return

        merged_pairs = []
        models_to_delete = set()  # indices into all_models consumed by a merge

        print(f"Checking {len(all_models)} models for merging opportunities...")

        progress_bar = tqdm(range(len(all_models)), desc="Merging models")
        for i in progress_bar:
            if i in models_to_delete:
                continue

            for j in range(i + 1, len(all_models)):
                if j in models_to_delete:
                    continue

                model1 = all_models[i]
                model2 = all_models[j]

                if self.mini_trainer.should_merge_models(model1, model2):
                    merged_model = self.mini_trainer.merge_mini_models(model1, model2)

                    # Save the merged model under a fresh id before deleting sources.
                    new_id = self.feather_manager.get_next_model_id()
                    self.feather_manager.save_mini_model(merged_model, new_id)

                    models_to_delete.add(i)
                    models_to_delete.add(j)
                    merged_pairs.append((model1.get('model_id', i), model2.get('model_id', j), new_id))

                    break

        for model_idx in models_to_delete:
            if model_idx < len(all_models):
                model_id = all_models[model_idx].get('model_id', model_idx + 1)
                self.feather_manager.delete_model(model_id)

        if merged_pairs:
            print(f"Merged {len(merged_pairs)} pairs of similar models")
        else:
            print("No similar models found for merging")
279
+
280
+
281
def main():
    """Entry point: run the full training pipeline with merging enabled."""
    print("AgGPT-17 Scalable Feather Architecture Trainer")
    print("=" * 50)

    trainer = AgGPTTrainer()

    try:
        trainer.train(merge_similar=True)
    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
    except Exception as e:
        import traceback

        print(f"Training failed: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
training_data/corpora.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a33906ee116d1b69cc3a67eb9983ddb2fa3d18f2ea8498ca7431f9da829d354
3
+ size 49496569