|
|
import math
import os
import random
import re
from collections import Counter, defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple

import pandas as pd
from tqdm import tqdm

from feather import FeatherManager, similarity_score, calculate_confidence_score
|
|
|
|
|
class GrammarRules:
    """Namespace for text normalisation rules."""

    @staticmethod
    def apply_all_rules(text: str) -> str:
        """Return *text* with leading and trailing whitespace removed.

        Falsy input (``None`` or the empty string) is handed back unchanged.
        """
        return text.strip() if text else text
|
|
|
|
|
class PatternExtractor:
    """Derive keywords and normalised match patterns from raw user text."""

    # Compiled once: runs of ASCII letters delimited by word boundaries.
    _WORD_RE = re.compile(r'\b[a-zA-Z]+\b')

    def __init__(self):
        # Common English function words that are never reported as keywords.
        self.stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
            'will', 'would', 'could', 'should', 'may', 'might', 'can',
            'shall',
        }

    def extract_keywords(self, text: str) -> List[str]:
        """Return the distinct keywords of *text* in first-occurrence order.

        A keyword is a lower-cased run of ASCII letters that is longer than
        two characters and is not in ``self.stop_words``.

        Args:
            text: Raw input text; falsy input yields an empty list.

        Returns:
            De-duplicated keywords, ordered by where they first appear.
        """
        if not text:
            return []

        words = self._WORD_RE.findall(text.lower())
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # result is reproducible across runs (a set would be ordered by the
        # per-process string hash seed).
        return list(dict.fromkeys(
            word for word in words
            if len(word) > 2 and word not in self.stop_words
        ))

    def create_pattern(self, user_input: str) -> str:
        """Normalise *user_input* into a space-padded, lower-case pattern.

        Surrounding whitespace is stripped, the text is lower-cased, every
        internal whitespace run is collapsed to one space, and the result is
        wrapped in a single leading and trailing space.

        Args:
            user_input: Raw user text; falsy input yields ``""``.

        Returns:
            The padded pattern string.
        """
        if not user_input:
            return ""

        collapsed = re.sub(r'\s+', ' ', user_input.strip().lower())
        return f" {collapsed} "

    def calculate_pattern_similarity(self, pattern1: str, pattern2: str) -> float:
        """Score two patterns with ``similarity_score`` after stripping padding."""
        return similarity_score(pattern1.strip(), pattern2.strip())
|
|
|
|
|
|
|
|
class MiniModelTrainer:
    """Build, compare and merge dictionary-based mini-models.

    A mini-model is a plain ``dict`` with the keys ``patterns``,
    ``responses``, ``weights``, ``confidence``, ``grammar_rules``,
    ``keywords`` and ``training_samples``.
    """

    # Quoted so the class definition does not need the project type at
    # definition time; the annotation still names the same class.
    def __init__(self, feather_manager: "FeatherManager"):
        self.feather_manager = feather_manager
        self.pattern_extractor = PatternExtractor()
        self.grammar_rules = GrammarRules()

    @staticmethod
    def _unique(items: List[Any]) -> List[Any]:
        """Return *items* without duplicates, keeping first-occurrence order."""
        return list(dict.fromkeys(items))

    def train_mini_model(self, training_pairs: List[Tuple[str, str]],
                         confidence_threshold: float = 0.1) -> Optional[Dict[str, Any]]:
        """Build one mini-model from ``(user_input, ai_response)`` pairs.

        Args:
            training_pairs: Conversation pairs to learn from.
            confidence_threshold: Currently unused; kept so existing callers
                that pass it keep working.

        Returns:
            The mini-model dict, or ``None`` when fewer than two pairs are
            supplied.
        """
        if not training_pairs or len(training_pairs) < 2:
            return None

        extractor = self.pattern_extractor
        patterns: List[str] = []
        responses: List[str] = []
        all_keywords: List[str] = []

        for user_input, ai_response in training_pairs:
            responses.append(ai_response.strip())
            patterns.append(extractor.create_pattern(user_input))
            all_keywords.extend(extractor.extract_keywords(user_input))

        # Confidence grows linearly with sample count and is capped at 0.9
        # (reached at 18 pairs).
        confidence = min(0.9, len(training_pairs) / 20.0)

        # Keep the ten most frequent keywords across the whole chunk.
        top_keywords = [word for word, _ in Counter(all_keywords).most_common(10)]

        return {
            'patterns': patterns,
            'responses': responses,
            # Every pair starts with the same unit weight.
            'weights': [1.0] * len(patterns),
            'confidence': confidence,
            'grammar_rules': [],
            'keywords': top_keywords,
            'training_samples': len(training_pairs),
        }

    def should_merge_models(self, model1: Dict[str, Any], model2: Dict[str, Any],
                            merge_threshold: float = 0.8) -> bool:
        """Decide whether two mini-models are similar enough to merge.

        All three conditions must hold:

        * the Jaccard overlap of the keyword sets exceeds *merge_threshold*;
        * the lower of the two confidences exceeds 0.7;
        * the mean ``similarity_score`` over the first five responses of each
          model (all cross pairs) exceeds *merge_threshold*.

        Args:
            model1: First mini-model dict.
            model2: Second mini-model dict.
            merge_threshold: Exclusive lower bound for both similarity tests.

        Returns:
            ``True`` when the models should be merged, else ``False``.
        """
        keywords1 = set(model1.get('keywords', []))
        keywords2 = set(model2.get('keywords', []))
        if not keywords1 or not keywords2:
            return False

        # Cheap checks first: they can rule a pair out without running the
        # pairwise response comparison below.
        keyword_similarity = len(keywords1 & keywords2) / len(keywords1 | keywords2)
        if not keyword_similarity > merge_threshold:
            return False

        min_confidence = min(model1.get('confidence', 0), model2.get('confidence', 0))
        if not min_confidence > 0.7:
            return False

        # Only a sample of up to 5 x 5 response pairs is compared.
        response_similarities = [
            similarity_score(r1, r2)
            for r1 in model1.get('responses', [])[:5]
            for r2 in model2.get('responses', [])[:5]
        ]
        if not response_similarities:
            return False  # mean is treated as 0, which can never pass

        avg_response_similarity = sum(response_similarities) / len(response_similarities)
        return avg_response_similarity > merge_threshold

    def merge_mini_models(self, model1: Dict[str, Any],
                          model2: Dict[str, Any]) -> Dict[str, Any]:
        """Combine two mini-models into a new one without mutating either.

        Patterns, responses and weights are concatenated (model1 first),
        confidence is the plain mean of the two, training sample counts are
        summed, and grammar rules / keywords are de-duplicated while keeping
        their first-occurrence order.

        Args:
            model1: First mini-model dict.
            model2: Second mini-model dict.

        Returns:
            The merged mini-model dict.
        """
        def joined(key: str) -> List[Any]:
            return model1.get(key, []) + model2.get(key, [])

        return {
            'patterns': joined('patterns'),
            'responses': joined('responses'),
            'weights': joined('weights'),
            'confidence': (model1.get('confidence', 0) + model2.get('confidence', 0)) / 2,
            'grammar_rules': self._unique(joined('grammar_rules')),
            'keywords': self._unique(joined('keywords')),
            'training_samples': (model1.get('training_samples', 0)
                                 + model2.get('training_samples', 0)),
        }
|
|
|
|
|
|
|
|
class AgGPTTrainer:
    """Drive the whole training run: parse corpus, chunk, train, merge."""

    # A chunk with fewer pairs than this is discarded rather than trained.
    _MIN_CHUNK_PAIRS = 5

    # Text after "user:" up to a "<pad>" line, an "ai:" line, or end of text.
    _USER_RE = re.compile(r'user:\s*(.*?)(?=\n<pad>|\nai:|$)', re.DOTALL)
    # Everything after the first "ai:" up to the end of the conversation.
    _AI_RE = re.compile(r'ai:\s*(.*?)$', re.DOTALL)

    def __init__(self, models_dir: str = "models"):
        self.feather_manager = FeatherManager(models_dir)
        self.mini_trainer = MiniModelTrainer(self.feather_manager)
        self.target_size_mb = 5
        # Rough per-pair size estimate (bytes) used to size chunks.
        self.estimated_size_per_pair = 1000
        # Number of pairs per chunk so that a chunk is about target_size_mb.
        self.chunk_size = (self.target_size_mb * 1024 * 1024) // self.estimated_size_per_pair

    def load_training_data(self, file_path: str) -> List[Tuple[str, str]]:
        """Parse a corpus file into ``(user_input, ai_response)`` pairs.

        The file is read as UTF-8 and split on ``<eos>``. In each piece the
        text after the first ``user:`` and the text after the first ``ai:``
        are extracted, ``<pad>`` markers are removed, and the pair is kept
        only when both sides are non-empty.

        Args:
            file_path: Path of the corpus file.

        Returns:
            The extracted pairs, in file order.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        conversations = content.split('<eos>')
        print(f"Processing {len(conversations)} conversation chunks...")

        training_pairs = []
        for conversation in tqdm(conversations, desc="Parsing conversations"):
            conversation = conversation.strip()
            if not conversation:
                continue

            user_match = self._USER_RE.search(conversation)
            ai_match = self._AI_RE.search(conversation)
            if not (user_match and ai_match):
                continue

            user_input = user_match.group(1).replace('<pad>', '').strip()
            ai_response = ai_match.group(1).replace('<pad>', '').strip()
            if user_input and ai_response:
                training_pairs.append((user_input, ai_response))

        print(f"Extracted {len(training_pairs)} training pairs")
        return training_pairs

    def create_training_chunks(self, training_pairs: List[Tuple[str, str]]) -> List[List[Tuple[str, str]]]:
        """Shuffle the pairs and slice them into ``self.chunk_size`` chunks.

        The input list is not modified. Any chunk with fewer than five pairs
        (in practice only a short trailing remainder) is dropped.

        Args:
            training_pairs: All pairs to distribute.

        Returns:
            The list of chunks, each a list of pairs.
        """
        shuffled_pairs = list(training_pairs)
        random.shuffle(shuffled_pairs)

        step = self.chunk_size
        candidates = (
            shuffled_pairs[start:start + step]
            for start in range(0, len(shuffled_pairs), step)
        )
        chunks = [chunk for chunk in candidates if len(chunk) >= self._MIN_CHUNK_PAIRS]

        print(f"Created {len(chunks)} training chunks (target: {self.target_size_mb}MB each)")
        return chunks

    def train(self, training_file: str = "training_data/corpora.txt", merge_similar: bool = True) -> None:
        """Run a full training pass and persist the resulting mini-models.

        Args:
            training_file: Corpus file handed to ``load_training_data``.
            merge_similar: When true and more than one model was trained,
                similar models are merged afterwards.

        Raises:
            OSError: If the training file cannot be read.
        """
        print("Starting AgGPT-17 Training with Scalable Feather Architecture")
        print("=" * 60)

        # Load and validate the corpus BEFORE wiping anything: an unreadable
        # or empty training file must not destroy the existing models.
        print("Loading training data...")
        training_pairs = self.load_training_data(training_file)
        if not training_pairs:
            print("No training data found!")
            return

        cleared_count = self.feather_manager.clear_all_models()
        if cleared_count > 0:
            print(f"Cleared {cleared_count} existing models")

        print("Creating training chunks...")
        training_chunks = self.create_training_chunks(training_pairs)

        print("Training mini-models...")
        trained_models = []
        model_id = 1  # ids are assigned sequentially starting from 1

        progress_bar = tqdm(training_chunks, desc="Training mini-models")
        for chunk in progress_bar:
            mini_model = self.mini_trainer.train_mini_model(chunk)
            if not mini_model:
                # No model for this chunk, so nothing to save or report.
                continue

            trained_models.append(mini_model)
            self.feather_manager.save_mini_model(mini_model, model_id)
            model_id += 1

            progress_bar.set_postfix({
                'Models': len(trained_models),
                'Confidence': f"{mini_model['confidence']:.3f}"
            })

        print(f"Trained {len(trained_models)} mini-models")

        if merge_similar and len(trained_models) > 1:
            print("Merging similar models...")
            self._merge_similar_models()

        final_count = self.feather_manager.get_model_count()
        print(f"Training complete! Final model count: {final_count}")
        print("=" * 60)

    def _merge_similar_models(self) -> None:
        """Merge stored models pairwise and delete the originals.

        Each model is merged with at most one later model (the first that
        ``should_merge_models`` accepts). The merged model is saved under a
        fresh id; merged results are not themselves reconsidered in this pass.
        """
        all_models = self.feather_manager.load_all_models()
        if len(all_models) < 2:
            return

        def stored_id(index: int) -> Any:
            # Falls back to a 1-based position when the model carries no id.
            # NOTE(review): assumes load order matches the ids used when
            # saving; confirm against FeatherManager.
            return all_models[index].get('model_id', index + 1)

        merged_pairs = []
        models_to_delete = set()

        print(f"Checking {len(all_models)} models for merging opportunities...")

        for i in tqdm(range(len(all_models)), desc="Merging models"):
            if i in models_to_delete:
                continue
            model1 = all_models[i]

            for j in range(i + 1, len(all_models)):
                if j in models_to_delete:
                    continue
                model2 = all_models[j]

                if not self.mini_trainer.should_merge_models(model1, model2):
                    continue

                merged_model = self.mini_trainer.merge_mini_models(model1, model2)
                new_id = self.feather_manager.get_next_model_id()
                self.feather_manager.save_mini_model(merged_model, new_id)

                models_to_delete.update((i, j))
                merged_pairs.append((stored_id(i), stored_id(j), new_id))
                break  # model i is consumed; move on to the next candidate

        # Sorted so the deletions happen in a reproducible order.
        for model_idx in sorted(models_to_delete):
            self.feather_manager.delete_model(stored_id(model_idx))

        if merged_pairs:
            print(f"Merged {len(merged_pairs)} pairs of similar models")
        else:
            print("No similar models found for merging")
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: build a default trainer and run it.

    A keyboard interrupt during training is reported with a short message.
    Any other exception raised by training is printed together with its
    traceback and is not re-raised.
    """
    import traceback

    print("AgGPT-17 Scalable Feather Architecture Trainer")
    print("=" * 50)

    trainer = AgGPTTrainer()

    try:
        trainer.train(merge_similar=True)
    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
    except Exception as exc:
        print(f"Training failed: {exc}")
        traceback.print_exc()
|
|
|
|
|
|
|
|
# Run the trainer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|