|
|
import ast
import math
import os
from typing import Any, Dict, List, Optional

import pandas as pd
|
|
|
|
|
|
|
|
class FeatherManager:
    """Persist and retrieve AgGPT expert mini-models as Feather files.

    On-disk layout: each model is a single DataFrame. Row 0 is a sentinel
    metadata row whose 'patterns' cell starts with "__METADATA__"; rows
    1..N hold one pattern/response pair each. Files are named
    "AgGPT_Expert_<id:04d>.feather" inside ``models_dir``.
    """

    def __init__(self, models_dir: str = "models"):
        """Create the manager and ensure *models_dir* exists."""
        self.models_dir = models_dir  # directory holding all model files
        os.makedirs(models_dir, exist_ok=True)

    def _model_filename(self, model_id: int) -> str:
        """Canonical filename for *model_id* (zero-padded to 4 digits)."""
        return f"AgGPT_Expert_{model_id:04d}.feather"

    @staticmethod
    def _is_model_file(filename: str) -> bool:
        """True when *filename* follows the expert-model naming scheme."""
        return filename.startswith("AgGPT_Expert_") and filename.endswith(".feather")

    @staticmethod
    def _parse_model_id(filename: str) -> int:
        """Extract the integer id from "AgGPT_Expert_<id>.feather".

        Raises ValueError or IndexError for malformed names.
        """
        return int(filename.split("_")[2].split(".")[0])

    @staticmethod
    def _safe_literal(text: str, default: Any) -> Any:
        """Parse a Python literal safely (ast.literal_eval, never eval).

        The previous implementation used eval() on file contents, which
        executes arbitrary code from a tampered model file.  Returns
        *default* when *text* is not a valid literal.
        """
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return default

    def save_mini_model(self, model_data: Dict[str, Any], model_id: int) -> str:
        """Serialize *model_data* to a Feather file and return its path.

        Short 'responses'/'weights'/'response_templates' lists are padded
        to the pattern count on local copies, so the caller's lists are
        never mutated (the old code appended to them in place).
        """
        filename = self._model_filename(model_id)
        filepath = os.path.join(self.models_dir, filename)

        patterns = model_data.get('patterns', [])
        responses = model_data.get('responses', [])

        if not patterns or not responses:
            print(f"Warning: Model {model_id} has empty patterns or responses")
            patterns = patterns or ['hello']
            responses = responses or ['Hello!']

        # Stringify and pad on copies only.
        pattern_strs = [str(p) for p in patterns]
        response_strs = [str(r) for r in responses]
        weights = list(model_data.get('weights', [1.0] * len(patterns)))
        templates = list(model_data.get('response_templates', [{}] * len(patterns)))

        target = len(pattern_strs)
        response_strs += [''] * (target - len(response_strs))
        weights += [1.0] * (target - len(weights))
        templates += [{}] * (target - len(templates))

        # Row 0: sentinel metadata row.  'response_templates' packs several
        # scalar fields joined with '|'; NOTE this breaks if any field's
        # string form itself contains '|' (pre-existing format limitation).
        packed_meta = '|'.join([
            ' '.join(model_data.get('keywords', [])),
            str(model_data.get('training_samples', 0)),
            str(model_data.get('semantic_categories', {})),
            str(model_data.get('grammar_rules', [])),
        ])
        metadata_row = {
            'patterns': f"__METADATA__{model_id}",
            'responses': str(model_data.get('knowledge_base', {})),
            'weights': float(model_data.get('confidence', 0.5)),
            'response_templates': packed_meta,
        }

        combined_rows: List[Dict[str, Any]] = [metadata_row]
        for i in range(target):
            combined_rows.append({
                'patterns': pattern_strs[i],
                'responses': response_strs[i],
                'weights': weights[i],
                'response_templates': str(templates[i]),
            })

        pd.DataFrame(combined_rows).to_feather(filepath)

        print(f"Saved optimized mini-model: {filename} ({len(patterns)} patterns + metadata)")
        return filepath

    def _decode_metadata(self, df: pd.DataFrame, model_id: int) -> Dict[str, Any]:
        """Decode the sentinel metadata row (row 0) into a model dict."""
        parts = df['response_templates'].iloc[0].split('|')
        return {
            'model_id': model_id,
            'confidence': float(df['weights'].iloc[0]),
            'keywords': parts[0].split() if len(parts) > 0 else [],
            'training_samples': int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0,
            'knowledge_base': self._safe_literal(df['responses'].iloc[0], {}),
            'semantic_categories': self._safe_literal(parts[2], {}) if len(parts) > 2 else {},
            'grammar_rules': self._safe_literal(parts[3], []) if len(parts) > 3 else [],
        }

    def _decode_pattern_rows(self, pattern_df: pd.DataFrame,
                             model_data: Dict[str, Any]) -> None:
        """Fill patterns/responses/weights/templates from rows 1..N (in place).

        Empty pattern/response cells (padding artifacts) are dropped.
        """
        model_data['patterns'] = [p for p in pattern_df['patterns'].tolist() if p]
        model_data['responses'] = [r for r in pattern_df['responses'].tolist() if r]
        model_data['weights'] = pattern_df['weights'].tolist()
        model_data['response_templates'] = [
            self._safe_literal(t, {})
            for t in pattern_df['response_templates'].tolist()
        ]

    def load_mini_model(self, model_id: int) -> Optional[Dict[str, Any]]:
        """Load and decode one model.

        Returns None when the file is missing, empty, or unreadable.
        """
        filepath = os.path.join(self.models_dir, self._model_filename(model_id))

        if not os.path.exists(filepath):
            return None

        try:
            df = pd.read_feather(filepath)
            if len(df) == 0:
                return None

            if df['patterns'].iloc[0].startswith('__METADATA__'):
                model_data = self._decode_metadata(df, model_id)
                self._decode_pattern_rows(df.iloc[1:], model_data)
            else:
                # Legacy layout: no metadata row, one pattern per row.
                model_data = {
                    'patterns': [p for p in df['patterns'].tolist() if p],
                    'responses': [r for r in df['responses'].tolist() if r],
                    'weights': df['weights'].tolist(),
                    'confidence': df['confidence'].iloc[0] if 'confidence' in df.columns else 0.5,
                    'keywords': df['keywords'].iloc[0].split() if 'keywords' in df.columns else [],
                    'training_samples': df['training_samples'].iloc[0] if 'training_samples' in df.columns else 0,
                    'model_id': model_id,
                    'knowledge_base': {},
                    'semantic_categories': {},
                    'response_templates': [],
                    'grammar_rules': [],
                }

            return model_data

        except Exception as e:
            print(f"Error loading optimized model {model_id}: {e}")
            return None

    def load_all_models(self) -> List[Dict[str, Any]]:
        """Load every model file in models_dir, skipping malformed names."""
        models: List[Dict[str, Any]] = []

        if not os.path.exists(self.models_dir):
            return models

        for filename in os.listdir(self.models_dir):
            if not self._is_model_file(filename):
                continue
            try:
                model_id = self._parse_model_id(filename)
            except (ValueError, IndexError):
                print(f"Warning: Invalid model filename format: {filename}")
                continue
            model = self.load_mini_model(model_id)
            if model:
                models.append(model)

        return models

    def get_model_count(self) -> int:
        """Number of model files currently on disk."""
        if not os.path.exists(self.models_dir):
            return 0
        return sum(1 for f in os.listdir(self.models_dir) if self._is_model_file(f))

    def get_next_model_id(self) -> int:
        """Next unused id: one past the highest id found on disk."""
        if not os.path.exists(self.models_dir):
            return 1

        max_id = 0
        for filename in os.listdir(self.models_dir):
            if not self._is_model_file(filename):
                continue
            try:
                max_id = max(max_id, self._parse_model_id(filename))
            except (ValueError, IndexError):
                continue

        return max_id + 1

    def delete_model(self, model_id: int) -> bool:
        """Delete one model file; True on success, False otherwise."""
        filename = self._model_filename(model_id)
        filepath = os.path.join(self.models_dir, filename)

        if not os.path.exists(filepath):
            return False

        try:
            os.remove(filepath)
            print(f"Deleted model: {filename}")
            return True
        except Exception as e:
            print(f"Error deleting model {model_id}: {e}")
            return False

    def clear_all_models(self) -> int:
        """Delete every model file; return how many were removed."""
        if not os.path.exists(self.models_dir):
            return 0

        deleted_count = 0
        for filename in os.listdir(self.models_dir):
            if not self._is_model_file(filename):
                continue
            try:
                os.remove(os.path.join(self.models_dir, filename))
                deleted_count += 1
            except Exception as e:
                print(f"Error deleting {filename}: {e}")

        print(f"Deleted {deleted_count} model files")
        return deleted_count
|
|
|
|
|
|
|
|
def similarity_score(text1: str, text2: str) -> float:
    """Enhanced semantic similarity calculation for AgGPT-19.

    Blends word-set Jaccard overlap, synonym-aware word similarity,
    character n-gram overlap and LCS-based character similarity, then
    applies a penalty for mismatched word counts.  Result is in [0, 1].
    """
    if not text1 or not text2:
        return 0.0

    left = text1.lower().strip()
    right = text2.lower().strip()

    # Exact (case/whitespace-insensitive) match short-circuits everything.
    if left == right:
        return 1.0

    char_sim = _character_similarity(left, right)

    tokens_left = set(left.split())
    tokens_right = set(right.split())

    # With no tokens on either side, only character evidence remains.
    if not tokens_left or not tokens_right:
        return char_sim * 0.3

    shared = tokens_left & tokens_right
    combined = tokens_left | tokens_right
    jaccard = len(shared) / len(combined) if combined else 0.0

    semantic_sim = _semantic_word_similarity(tokens_left, tokens_right)
    ngram_sim = _ngram_similarity(left, right)

    # Penalize texts whose word counts differ greatly (at most halves score).
    count_left, count_right = len(left.split()), len(right.split())
    length_penalty = 1.0 - min(
        abs(count_left - count_right) / max(count_left, count_right, 1), 0.5
    )

    blended = (
        0.4 * jaccard
        + 0.3 * semantic_sim
        + 0.2 * ngram_sim
        + 0.1 * char_sim
    ) * length_penalty

    return min(blended, 1.0)
|
|
|
|
|
|
|
|
def _character_similarity(text1: str, text2: str) -> float:
    """Calculate character-level similarity using longest common subsequence.

    Returns 2*LCS / (len1 + len2), i.e. a Dice-style normalization in [0, 1].
    """
    if not text1 or not text2:
        return 0.0

    rows, cols = len(text1), len(text2)

    # Rolling one-row DP: prev holds LCS lengths for the previous text1 char.
    prev = [0] * (cols + 1)
    for r in range(1, rows + 1):
        curr = [0] * (cols + 1)
        ch = text1[r - 1]
        for c in range(1, cols + 1):
            if ch == text2[c - 1]:
                curr[c] = prev[c - 1] + 1
            elif curr[c - 1] >= prev[c]:
                curr[c] = curr[c - 1]
            else:
                curr[c] = prev[c]
        prev = curr

    lcs_length = prev[cols]
    total = rows + cols
    return (2.0 * lcs_length) / total if total > 0 else 0.0
|
|
|
|
|
|
|
|
def _semantic_word_similarity(words1: set, words2: set) -> float:
    """Calculate semantic similarity between word sets.

    Each (word1, word2) pair contributes at most one score: exact match
    (1.0), synonym-set match (0.8), key-to-synonym match (0.9), substring
    containment (0.6) or shared 3-char prefix (0.4) — whichever is highest.

    Bug fixed: the previous version ADDED the synonym bonus and the
    substring/prefix bonus for the same pair (e.g. 'mini' vs 'minute'
    scored 0.8 + 0.6 = 1.4), which could push the returned ratio above 1.0.
    """
    if not words1 or not words2:
        return 0.0

    # Small built-in synonym lexicon; multi-word entries never match
    # single tokens but are kept for parity with the stored data.
    synonyms = {
        'hello': {'hi', 'hey', 'greetings', 'good morning', 'good afternoon'},
        'thanks': {'thank you', 'appreciate', 'grateful'},
        'yes': {'yeah', 'yep', 'sure', 'absolutely', 'definitely'},
        'no': {'nope', 'negative', 'not really'},
        'good': {'great', 'excellent', 'wonderful', 'amazing', 'fantastic'},
        'bad': {'terrible', 'awful', 'horrible', 'poor'},
        'big': {'large', 'huge', 'enormous', 'massive'},
        'small': {'little', 'tiny', 'mini', 'minute'},
    }

    semantic_matches = 0.0
    total_comparisons = 0

    for word1 in words1:
        for word2 in words2:
            total_comparisons += 1

            if word1 == word2:
                semantic_matches += 1.0
                continue

            pair_score = 0.0

            # Synonym relations: siblings in a set score 0.8; a key paired
            # with one of its synonyms scores slightly higher (0.9).
            for key, synonym_set in synonyms.items():
                if word1 in synonym_set and word2 in synonym_set:
                    pair_score = 0.8
                    break
                elif (word1 == key and word2 in synonym_set) or (word2 == key and word1 in synonym_set):
                    pair_score = 0.9
                    break

            # Morphological similarity for longer words; take the best
            # single signal rather than stacking bonuses.
            if len(word1) > 3 and len(word2) > 3:
                if word1 in word2 or word2 in word1:
                    pair_score = max(pair_score, 0.6)
                elif word1[:3] == word2[:3]:
                    pair_score = max(pair_score, 0.4)

            semantic_matches += pair_score

    return semantic_matches / total_comparisons if total_comparisons > 0 else 0.0
|
|
|
|
|
|
|
|
def _ngram_similarity(text1: str, text2: str, n: int = 3) -> float:
    """Calculate n-gram similarity for character patterns.

    Jaccard overlap of the two texts' character n-gram sets (trigrams
    by default); 0.0 when either text is shorter than *n*.
    """
    if len(text1) < n or len(text2) < n:
        return 0.0

    def grams(text: str) -> set:
        # All overlapping windows of length n.
        return {text[i:i + n] for i in range(len(text) - n + 1)}

    left_grams = grams(text1)
    right_grams = grams(text2)

    if not left_grams or not right_grams:
        return 0.0

    shared = left_grams & right_grams
    combined = left_grams | right_grams

    return len(shared) / len(combined) if combined else 0.0
|
|
|
|
|
|
|
|
def calculate_confidence_score(patterns: List[str], responses: List[str]) -> float:
    """Confidence grows with pattern count, clamped to [0.1, 0.9].

    Returns the floor value 0.1 for empty or mismatched-length inputs.
    """
    mismatched = not patterns or not responses or len(patterns) != len(responses)
    if mismatched:
        return 0.1

    # One pattern adds 0.1 confidence, saturating at 0.9.
    scaled = len(patterns) / 10.0
    base_confidence = min(0.9, scaled)

    return max(0.1, min(1.0, base_confidence))
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: save one greeting model, reload it, and report counts.
    demo_manager = FeatherManager()

    demo_model = {
        'patterns': ['hello', 'hi', 'hey'],
        'responses': ['Hello! How can I help you?', 'Hi there!', "Hey! What's up?"],
        'weights': [1.0, 0.9, 0.8],
        'confidence': 0.8,
        'grammar_rules': ['capitalize_first_word', 'end_with_punctuation'],
        'keywords': ['greeting', 'hello', 'hi'],
        'training_samples': 150,
    }

    new_id = demo_manager.get_next_model_id()
    demo_manager.save_mini_model(demo_model, new_id)
    round_tripped = demo_manager.load_mini_model(new_id)

    print(f"Original model: {demo_model}")
    print(f"Loaded model: {round_tripped}")
    print(f"Models count: {demo_manager.get_model_count()}")
|
|
|