"""AgGPT-19 trainer: builds small pattern/response "mini-models" from
conversation corpora and stores them via feather.FeatherManager."""

import os
import random
import re
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple

import yaml
from tqdm import tqdm

from feather import FeatherManager, similarity_score
|
|
class GrammarRules:
    """Post-processing hooks for generated text; currently just trims whitespace."""

    @staticmethod
    def apply_all_rules(text: str) -> str:
        if not text:
            return text

        return text.strip()
|
|
class PatternExtractor:
    """Derives keywords, simple entities, and literal/semantic patterns from text."""

    def __init__(self):
        # Surface tokens mapped onto coarse semantic categories.
        self.semantic_groups = {
            'questions': ['what', 'how', 'why', 'when', 'where', 'who', 'which', 'can', 'could', 'would', 'should', 'is', 'are', 'do', 'does'],
            'greetings': ['hello', 'hi', 'hey', 'greetings', 'good morning', 'good afternoon', 'good evening'],
            'farewells': ['goodbye', 'bye', 'see you', 'farewell', 'take care'],
            'requests': ['please', 'can you', 'could you', 'would you', 'help me', 'i need', 'i want'],
            'emotions': ['happy', 'sad', 'angry', 'excited', 'worried', 'confused', 'frustrated'],
            'affirmations': ['yes', 'yeah', 'sure', 'okay', 'alright', 'definitely', 'absolutely'],
            'negations': ['no', 'not', 'never', 'nothing', 'none', 'neither'],
        }

        self.stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
|
    def extract_keywords(self, text: str) -> List[str]:
        if not text:
            return []

        full_text_normalized = re.sub(r'\s+', ' ', text.strip().lower())
        words = re.findall(r'\b[a-zA-Z]+\b', full_text_normalized)

        # Drop stop words and very short tokens.
        meaningful_words = [word for word in words if word not in self.stop_words and len(word) > 2]

        # Tag the input with every semantic category it touches.
        semantic_keywords = []
        for category, category_words in self.semantic_groups.items():
            if any(word in meaningful_words for word in category_words):
                semantic_keywords.append(f'semantic_{category}')

        # Entity extraction needs the original casing, so pass the raw text
        # (the lowercased copy would never match the capitalized-word regex).
        entities = self._extract_simple_entities(text)

        result = [full_text_normalized]
        result.extend(meaningful_words[:10])
        result.extend(semantic_keywords)
        result.extend(entities)

        return list(set(result))
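    # Illustrative sketch (not part of the pipeline): for an input such as
    # "What is Python?", extract_keywords returns roughly
    #   ['what is python?', 'what', 'python', 'semantic_questions',
    #    'entity_what', 'entity_python']
    # i.e. the normalized text plus content words, semantic tags, and entities.
    # Order varies because the result passes through a set.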
|
|
|
|
|
    def _extract_simple_entities(self, text: str) -> List[str]:
        """Extract simple entities without external libraries."""
        entities = []

        # Standalone numbers.
        numbers = re.findall(r'\b\d+\b', text)
        entities.extend([f'number_{num}' for num in numbers[:3]])

        # Capitalized words; this requires the original (non-lowercased) text.
        capitalized_words = re.findall(r'\b[A-Z][a-z]+\b', text)
        entities.extend([f'entity_{word.lower()}' for word in capitalized_words[:3]])

        # Simple time references.
        time_patterns = ['today', 'tomorrow', 'yesterday', 'morning', 'evening', 'night', 'afternoon']
        for pattern in time_patterns:
            if pattern in text.lower():
                entities.append(f'time_{pattern}')

        return entities
|
    def create_pattern(self, user_input: str) -> str:
        if not user_input:
            return ""

        normalized = re.sub(r'\s+', ' ', user_input.strip().lower())

        # Build a token-level semantic abstraction of the input.
        words = normalized.split()
        semantic_pattern = []

        for word in words:
            added_semantic = False
            for category, category_words in self.semantic_groups.items():
                if word in category_words:
                    semantic_pattern.append(f'<{category}>')
                    added_semantic = True
                    break

            if not added_semantic:
                if word in self.stop_words:
                    semantic_pattern.append('<stop>')
                elif word.isdigit():
                    semantic_pattern.append('<number>')
                elif len(word) > 6:
                    # Long words are kept verbatim as likely content words.
                    semantic_pattern.append(word)
                else:
                    semantic_pattern.append('<word>')

        # Store both the literal text and its semantic structure, '|'-separated.
        literal_pattern = f" {normalized} "
        semantic_structure = " ".join(semantic_pattern)

        return f"{literal_pattern}|{semantic_structure}"
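    # Illustrative: create_pattern("what is python") produces
    #   " what is python |<questions> <questions> <word>"
    # that is, a literal half and a semantic half separated by '|'. Both
    # 'what' and 'is' fall in the 'questions' group; 'python' is a short
    # non-stop word, so it is abstracted to <word>.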
|
|
|
|
|
    def calculate_pattern_similarity(self, pattern1: str, pattern2: str) -> float:
        if not pattern1 or not pattern2:
            return 0.0

        # Each pattern is "literal|semantic"; compare the two halves separately.
        parts1 = pattern1.strip().split('|')
        parts2 = pattern2.strip().split('|')

        literal1 = parts1[0].strip()
        literal2 = parts2[0].strip()

        literal_sim = similarity_score(literal1, literal2)

        semantic_sim = 0.0
        if len(parts1) > 1 and len(parts2) > 1:
            semantic1 = parts1[1].strip()
            semantic2 = parts2[1].strip()
            semantic_sim = self._semantic_structure_similarity(semantic1, semantic2)

        # Weight the literal match more heavily than the structural match.
        if semantic_sim > 0:
            return literal_sim * 0.7 + semantic_sim * 0.3
        else:
            return literal_sim
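    # Illustrative: two patterns with identical semantic halves but different
    # wording score 0.7 * literal_sim + 0.3 * 1.0, so shared structure can
    # lift near-miss literal matches; with no semantic half, the literal
    # similarity is returned unchanged.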
|
|
|
|
|
    def _semantic_structure_similarity(self, struct1: str, struct2: str) -> float:
        """Compare semantic structures token by token."""
        if not struct1 or not struct2:
            return 0.0

        tokens1 = struct1.split()
        tokens2 = struct2.split()

        if not tokens1 or not tokens2:
            return 0.0

        matches = 0.0
        total = max(len(tokens1), len(tokens2))

        for i in range(min(len(tokens1), len(tokens2))):
            if tokens1[i] == tokens2[i]:
                matches += 1
            elif tokens1[i].startswith('<') and tokens2[i].startswith('<'):
                # Different placeholder categories still count as a partial match.
                matches += 0.5

        return matches / total if total > 0 else 0.0
|
|
class MiniModelTrainer:
    """Trains individual pattern/response mini-models from (input, response) pairs."""

    def __init__(self, feather_manager: FeatherManager):
        self.feather_manager = feather_manager
        self.pattern_extractor = PatternExtractor()
        self.grammar_rules = GrammarRules()
|
    def train_mini_model(self, training_pairs: List[Tuple[str, str]], confidence_threshold: float = 0.1) -> Optional[Dict[str, Any]]:
        # NOTE: confidence_threshold is accepted but not used yet.
        if not training_pairs or len(training_pairs) < 2:
            return None

        keyword_patterns = []
        responses = []
        pattern_confidences = []
        all_keywords = []
        response_templates = []
        knowledge_base = {}

        # Confidence grows with sample count and is capped at 0.9; the same
        # value applies to the model as a whole and to each pattern.
        base_confidence = min(0.9, len(training_pairs) / 20.0)

        for user_input, ai_response in training_pairs:
            processed_response = ai_response.strip()

            pattern = self.pattern_extractor.create_pattern(user_input)
            keywords = self.pattern_extractor.extract_keywords(user_input)
            all_keywords.extend(keywords)

            template = self._create_response_template(ai_response, user_input)
            response_templates.append(template)

            knowledge_entry = self._extract_knowledge(user_input, ai_response)
            if knowledge_entry:
                knowledge_base.update(knowledge_entry)

            keyword_patterns.append(pattern)
            responses.append(processed_response)
            pattern_confidences.append(base_confidence)

        if not keyword_patterns:
            return None

        keyword_counter = Counter(all_keywords)
        top_keywords = [word for word, count in keyword_counter.most_common(15)]

        mini_model = {
            'patterns': keyword_patterns,
            'responses': responses,
            'response_templates': response_templates,
            'knowledge_base': knowledge_base,
            'pattern_confidences': pattern_confidences,
            'confidence': base_confidence,
            'grammar_rules': [],
            'keywords': top_keywords,
            'training_samples': len(training_pairs),
            'semantic_categories': self._analyze_semantic_categories(training_pairs)
        }

        return mini_model
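    # Usage sketch (illustrative; assumes a FeatherManager instance):
    #   trainer = MiniModelTrainer(FeatherManager("models"))
    #   model = trainer.train_mini_model([("hi", "Hello!"), ("bye", "Goodbye!")])
    # model is a dict with 'patterns', 'responses', 'confidence', etc., or
    # None when fewer than two pairs are supplied.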
|
|
|
|
|
    def _create_response_template(self, response: str, input_text: str) -> Dict[str, Any]:
        """Create a template for generating similar responses."""
        template = {
            'structure': 'direct',
            'length': 'medium',
            'tone': 'neutral',
            'placeholders': [],
            'key_phrases': [],
        }

        words = response.split()

        # Classify the overall structure.
        if '?' in response:
            template['structure'] = 'question'
        elif any(word in response.lower() for word in ['first', 'second', 'then', 'next', '1.', '2.']):
            template['structure'] = 'list'
        elif len(words) > 50:
            template['structure'] = 'explanation'

        # Bucket the response length.
        if len(words) < 10:
            template['length'] = 'short'
        elif len(words) > 30:
            template['length'] = 'long'

        # Rough tone detection from marker words.
        if any(word in response.lower() for word in ['please', 'thank you', 'great', 'wonderful']):
            template['tone'] = 'friendly'
        elif any(word in response.lower() for word in ['hey', 'yeah', 'cool', 'awesome']):
            template['tone'] = 'casual'

        # Keep up to three substantial sentences as key phrases.
        sentences = response.split('.')
        template['key_phrases'] = [sent.strip() for sent in sentences if sent.strip() and len(sent.strip()) > 10][:3]

        return template
|
    def _extract_knowledge(self, question: str, answer: str) -> Dict[str, str]:
        """Extract knowledge facts from Q&A pairs."""
        knowledge = {}
        question_lower = question.lower()

        # Definition-style questions.
        if any(word in question_lower for word in ['what is', 'what are', 'define']):
            subject = self._extract_subject(question)
            if subject:
                knowledge[f'definition_{subject}'] = answer[:200]

        # How-to questions.
        elif 'how to' in question_lower or 'how do' in question_lower:
            topic = question_lower.replace('how to', '').replace('how do', '').strip()
            if topic:
                knowledge[f'howto_{topic[:20]}'] = answer[:300]

        # Other factual questions, keyed by a hash of the question text.
        elif any(word in question_lower for word in ['where', 'when', 'who', 'which']):
            knowledge[f'fact_{hash(question) % 10000}'] = answer[:150]

        return knowledge
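    # Illustrative: _extract_knowledge("What is Python?", "Python is a language.")
    # yields {'definition_python?': 'Python is a language.'}; the key comes from
    # _extract_subject, which keeps up to three non-scaffolding words.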
|
|
|
|
|
    def _extract_subject(self, question: str) -> str:
        """Extract the main subject from a question."""
        words = question.lower().split()

        # Strip common question scaffolding, keep up to three content words.
        question_words = {'what', 'is', 'are', 'the', 'a', 'an'}
        filtered_words = [word for word in words if word not in question_words]

        if filtered_words:
            return '_'.join(filtered_words[:3])

        return ''
|
    def _analyze_semantic_categories(self, training_pairs: List[Tuple[str, str]]) -> Dict[str, int]:
        """Analyze what types of conversations this model handles."""
        categories = {
            'questions': 0, 'greetings': 0, 'requests': 0, 'explanations': 0,
            'personal': 0, 'technical': 0, 'casual': 0, 'factual': 0
        }

        for user_input, ai_response in training_pairs:
            input_lower = user_input.lower()

            # Categories are not mutually exclusive; a pair can count several times.
            if any(word in input_lower for word in ['what', 'how', 'why', 'when', 'where']):
                categories['questions'] += 1
            if any(word in input_lower for word in ['hello', 'hi', 'hey']):
                categories['greetings'] += 1
            if any(word in input_lower for word in ['please', 'can you', 'help']):
                categories['requests'] += 1
            if any(word in input_lower for word in ['i', 'my', 'me']):
                categories['personal'] += 1
            if any(word in input_lower for word in ['code', 'program', 'technical', 'computer']):
                categories['technical'] += 1
            if len(ai_response.split()) > 30:
                categories['explanations'] += 1

        return categories
|
    def should_merge_models(self, model1: Dict[str, Any], model2: Dict[str, Any], merge_threshold: float = 0.8) -> bool:
        keywords1 = set(model1.get('keywords', []))
        keywords2 = set(model2.get('keywords', []))

        if not keywords1 or not keywords2:
            return False

        # Jaccard similarity over keyword sets.
        keyword_similarity = len(keywords1.intersection(keywords2)) / len(keywords1.union(keywords2))

        responses1 = model1.get('responses', [])
        responses2 = model2.get('responses', [])

        # Pairwise similarity over a small sample of responses.
        response_similarities = []
        for r1 in responses1[:5]:
            for r2 in responses2[:5]:
                sim = similarity_score(r1, r2)
                response_similarities.append(sim)

        avg_response_similarity = sum(response_similarities) / len(response_similarities) if response_similarities else 0

        min_confidence = min(model1.get('confidence', 0), model2.get('confidence', 0))

        return (keyword_similarity > merge_threshold and
                avg_response_similarity > merge_threshold and
                min_confidence > 0.7)
|
    def merge_mini_models(self, model1: Dict[str, Any], model2: Dict[str, Any]) -> Dict[str, Any]:
        patterns1 = model1.get('patterns', [])
        patterns2 = model2.get('patterns', [])
        responses1 = model1.get('responses', [])
        responses2 = model2.get('responses', [])
        confidences1 = model1.get('pattern_confidences', [1.0] * len(patterns1))
        confidences2 = model2.get('pattern_confidences', [1.0] * len(patterns2))

        merged_model = {
            'patterns': patterns1 + patterns2,
            'responses': responses1 + responses2,
            # Carry templates and knowledge across so merging does not drop them.
            'response_templates': model1.get('response_templates', []) + model2.get('response_templates', []),
            'knowledge_base': {**model1.get('knowledge_base', {}), **model2.get('knowledge_base', {})},
            'pattern_confidences': confidences1 + confidences2,
            'confidence': (model1.get('confidence', 0) + model2.get('confidence', 0)) / 2,
            'grammar_rules': list(set(model1.get('grammar_rules', []) + model2.get('grammar_rules', []))),
            'keywords': list(set(model1.get('keywords', []) + model2.get('keywords', []))),
            'training_samples': model1.get('training_samples', 0) + model2.get('training_samples', 0)
        }

        return merged_model
|
|
class AgGPTTrainer:
    """Top-level trainer: chunks corpora, trains mini-models, and merges similar ones."""

    def __init__(self, models_dir: str = "models"):
        self.feather_manager = FeatherManager(models_dir)
        self.mini_trainer = MiniModelTrainer(self.feather_manager)
        # Aim for ~5 MB chunks, assuming ~1000 bytes per training pair,
        # which works out to roughly 5,242 pairs per chunk.
        self.target_size_mb = 5
        self.estimated_size_per_pair = 1000
        self.chunk_size = (self.target_size_mb * 1024 * 1024) // self.estimated_size_per_pair
        self.readable_weights_dir = "readable_weights"
        os.makedirs(self.readable_weights_dir, exist_ok=True)
|
    def save_model_as_yaml(self, model_data: Dict[str, Any], model_id: int):
        try:
            filename = f"AgGPT_Model_{model_id:04d}.yaml"
            filepath = os.path.join(self.readable_weights_dir, filename)

            print(f"Creating YAML data for model {model_id}...")
            yaml_data = {
                'model_info': {
                    'model_id': model_id,
                    'confidence': model_data.get('confidence', 0.5),
                    'training_samples': model_data.get('training_samples', 0),
                    'keywords': model_data.get('keywords', [])
                },
                'patterns_and_responses': []
            }

            patterns = model_data.get('patterns', [])
            responses = model_data.get('responses', [])
            # Mini-models store per-pattern confidences, not a 'weights' key.
            weights = model_data.get('pattern_confidences', [])

            print(f"Processing {len(patterns)} patterns...")
            for i in range(len(patterns)):
                entry = {
                    'pattern': patterns[i],
                    'response': responses[i] if i < len(responses) else '',
                    'weight': weights[i] if i < len(weights) else 1.0
                }
                yaml_data['patterns_and_responses'].append(entry)

            print(f"Writing YAML to {filepath}...")
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode=True, indent=2)

            print(f"Saved readable model: {filename}")
        except Exception as e:
            print(f"Error in save_model_as_yaml: {e}")
            import traceback
            traceback.print_exc()
|
    def load_training_data(self, file_path: str) -> List[Tuple[str, str]]:
        training_pairs = []

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Conversations are delimited by <eos>; each holds a user:/ai: pair.
        conversations = content.split('<eos>')

        print(f"Processing {len(conversations)} conversation chunks...")

        for conversation in tqdm(conversations, desc="Parsing conversations"):
            conversation = conversation.strip()
            if not conversation:
                continue

            user_match = re.search(r'user:\s*(.*?)(?=\n<pad>|\nai:|$)', conversation, re.DOTALL)
            ai_match = re.search(r'ai:\s*(.*?)$', conversation, re.DOTALL)

            if user_match and ai_match:
                user_input = user_match.group(1).strip()
                ai_response = ai_match.group(1).strip()

                # Strip padding tokens left over from the corpus format.
                user_input = re.sub(r'<pad>', '', user_input).strip()
                ai_response = re.sub(r'<pad>', '', ai_response).strip()

                if user_input and ai_response:
                    training_pairs.append((user_input, ai_response))

        print(f"Extracted {len(training_pairs)} training pairs")
        return training_pairs
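    # Expected corpus layout, inferred from the regexes above (illustrative):
    #   user: How are you?
    #   ai: I'm doing well, thanks!
    #   <eos>
    # Optional <pad> tokens may appear and are stripped during parsing.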
|
|
|
|
|
    def create_training_chunks(self, training_pairs: List[Tuple[str, str]]) -> List[List[Tuple[str, str]]]:
        # Shuffle so each chunk sees a mix of the corpus.
        shuffled_pairs = training_pairs.copy()
        random.shuffle(shuffled_pairs)

        chunks = []
        total_pairs = len(shuffled_pairs)

        for i in range(0, total_pairs, self.chunk_size):
            chunk = shuffled_pairs[i:i + self.chunk_size]
            # Skip tail chunks too small to train a meaningful model.
            if len(chunk) >= 5:
                chunks.append(chunk)

        print(f"Created {len(chunks)} training chunks (target: {self.target_size_mb}MB each)")
        return chunks
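    # Illustrative: with the default chunk_size of 5242 pairs, a corpus of
    # 12,000 pairs yields two full chunks plus a 1,516-pair remainder chunk.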
|
|
|
|
|
    def train_multiple_corpora(self, training_files: Optional[List[str]] = None, merge_similar: bool = True):
        """Train on multiple corpora files sequentially."""
        if training_files is None:
            # Default to every .txt file in training_corpora/, in sorted order.
            training_dir = "training_corpora"
            if os.path.exists(training_dir):
                training_files = []
                for filename in sorted(os.listdir(training_dir)):
                    if filename.endswith('.txt'):
                        training_files.append(os.path.join(training_dir, filename))
                print(f"Found {len(training_files)} text files in {training_dir}")
            else:
                print(f"Warning: {training_dir} directory not found, falling back to default files")
                training_files = ["training_data/corpora.txt", "training_data/corpora2.txt"]

        print("Starting AgGPT-19 Multi-Corpora Training with Enhanced Intelligence")
        print("=" * 70)

        cleared_count = self.feather_manager.clear_all_models()
        if cleared_count > 0:
            print(f"Cleared {cleared_count} existing models")

        all_trained_models = []
        total_model_id = 1

        for file_idx, training_file in enumerate(training_files, 1):
            print(f"\n--- Training on file {file_idx}/{len(training_files)}: {training_file} ---")

            if not os.path.exists(training_file):
                print(f"Warning: Training file {training_file} does not exist. Skipping...")
                continue

            if os.path.getsize(training_file) == 0:
                print(f"Warning: Training file {training_file} is empty. Skipping...")
                continue

            print(f"Loading training data from {training_file}...")
            training_pairs = self.load_training_data(training_file)

            if not training_pairs:
                print(f"No training data found in {training_file}. Skipping...")
                continue

            print(f"Creating training chunks for {training_file}...")
            training_chunks = self.create_training_chunks(training_pairs)

            print(f"Training mini-models from {training_file}...")
            file_trained_models = []

            progress_bar = tqdm(training_chunks, desc=f"Training from {os.path.basename(training_file)}")
            for chunk_idx, chunk in enumerate(progress_bar):
                print(f"\nProcessing chunk {chunk_idx + 1}/{len(training_chunks)}")
                mini_model = self.mini_trainer.train_mini_model(chunk)

                if mini_model:
                    file_trained_models.append(mini_model)
                    all_trained_models.append(mini_model)
                    print(f"Saving model {total_model_id}...")
                    self.feather_manager.save_mini_model(mini_model, total_model_id)

                    # Keep a human-readable YAML copy of the first model.
                    if total_model_id == 1:
                        print("Saving first model as YAML...")
                        try:
                            self.save_model_as_yaml(mini_model, total_model_id)
                            print("YAML saved successfully")
                        except Exception as e:
                            print(f"Error saving YAML: {e}")

                    total_model_id += 1
                    print(f"Model {total_model_id - 1} completed")

                    try:
                        progress_bar.set_postfix({
                            'File Models': len(file_trained_models),
                            'Total Models': len(all_trained_models),
                            'Confidence': f"{mini_model['confidence']:.3f}"
                        })
                    except Exception as e:
                        print(f"Error updating progress bar: {e}")

            print(f"Completed training on {training_file}: {len(file_trained_models)} mini-models created")
            print(f"Total models so far: {len(all_trained_models)}")

            # Optionally consolidate similar models after each corpus.
            if merge_similar and len(all_trained_models) > 1:
                print(f"Merging similar models after processing {training_file}...")
                self._merge_similar_models()
                current_count = self.feather_manager.get_model_count()
                print(f"Models after merging: {current_count}")

        print("\n--- Multi-Corpora Training Complete ---")
        final_count = self.feather_manager.get_model_count()
        print(f"Final model count: {final_count}")
        print(f"Trained on {len([f for f in training_files if os.path.exists(f) and os.path.getsize(f) > 0])} corpora files")
        print("=" * 70)
|
    def train(self, training_file: str = "training_data/corpora.txt", merge_similar: bool = True):
        print("Starting AgGPT-19 Training with Enhanced Intelligence")
        print("=" * 60)

        cleared_count = self.feather_manager.clear_all_models()
        if cleared_count > 0:
            print(f"Cleared {cleared_count} existing models")

        print("Loading training data...")
        training_pairs = self.load_training_data(training_file)

        if not training_pairs:
            print("No training data found!")
            return

        print("Creating training chunks...")
        training_chunks = self.create_training_chunks(training_pairs)

        print("Training mini-models...")
        trained_models = []
        model_id = 1

        progress_bar = tqdm(training_chunks, desc="Training mini-models")
        for chunk in progress_bar:
            mini_model = self.mini_trainer.train_mini_model(chunk)

            if mini_model:
                trained_models.append(mini_model)
                self.feather_manager.save_mini_model(mini_model, model_id)

                if model_id == 1:
                    self.save_model_as_yaml(mini_model, model_id)

                model_id += 1

                progress_bar.set_postfix({
                    'Models': len(trained_models),
                    'Confidence': f"{mini_model['confidence']:.3f}"
                })

        print(f"Trained {len(trained_models)} mini-models")

        if merge_similar and len(trained_models) > 1:
            print("Merging similar models...")
            self._merge_similar_models()

        final_count = self.feather_manager.get_model_count()
        print(f"Training complete! Final model count: {final_count}")
        print("=" * 60)
|
    def _merge_similar_models(self):
        all_models = self.feather_manager.load_all_models()
        if len(all_models) < 2:
            return

        merged_pairs = []
        models_to_delete = set()

        print(f"Checking {len(all_models)} models for merging opportunities...")

        progress_bar = tqdm(range(len(all_models)), desc="Merging models")
        for i in progress_bar:
            if i in models_to_delete:
                continue

            for j in range(i + 1, len(all_models)):
                if j in models_to_delete:
                    continue

                model1 = all_models[i]
                model2 = all_models[j]

                if self.mini_trainer.should_merge_models(model1, model2):
                    merged_model = self.mini_trainer.merge_mini_models(model1, model2)

                    new_id = self.feather_manager.get_next_model_id()
                    self.feather_manager.save_mini_model(merged_model, new_id)

                    models_to_delete.add(i)
                    models_to_delete.add(j)
                    merged_pairs.append((model1.get('model_id', i), model2.get('model_id', j), new_id))

                    # Each model participates in at most one merge per pass.
                    break

        for model_idx in models_to_delete:
            if model_idx < len(all_models):
                model_id = all_models[model_idx].get('model_id', model_idx + 1)
                self.feather_manager.delete_model(model_id)

        if merged_pairs:
            print(f"Merged {len(merged_pairs)} pairs of similar models")
        else:
            print("No similar models found for merging")
|
|
def main():
    print("AgGPT-19 Enhanced Intelligence Trainer")
    print("=" * 50)

    trainer = AgGPTTrainer()

    try:
        trainer.train_multiple_corpora(merge_similar=False)
    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
    except Exception as e:
        print(f"Training failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()