import logging
import os
from typing import Dict, List, Optional

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

logger = logging.getLogger(__name__)


class HistoricalMemoryLayer:
    """
    Historical Memory Layer using Retrieval-Augmented Generation (RAG) concepts.

    Stores successfully resolved past tickets. When a new ambiguous ticket
    arrives, it retrieves the K nearest historical tickets. This can be used
    to dynamically boost confidence or suggest resolutions.
    """

    def __init__(self, data_path: Optional[str] = None):
        """
        Args:
            data_path: Path to a CSV file with 'text' and 'category' columns.
                Defaults to <module parent dir>/data/processed/train.csv.
        """
        if data_path is None:
            base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            data_path = os.path.join(base, 'data', 'processed', 'train.csv')
        self.data_path = data_path
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        self.nn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
        self.memory_df = None
        # Flipped to True only after a successful index build in _load_memory().
        self.is_ready = False
        self._load_memory()

    def _load_memory(self):
        """
        Load the historical-ticket CSV and build the TF-IDF / k-NN index.

        Any failure (missing file, bad schema, parse error) leaves the layer
        disabled (is_ready == False) instead of raising, so callers can
        degrade gracefully when no memory is available.
        """
        try:
            if not os.path.exists(self.data_path):
                logger.warning("[HistoricalMemory] Data file not found at %s", self.data_path)
                return
            self.memory_df = pd.read_csv(self.data_path)
            # Ensure required columns exist
            if 'text' not in self.memory_df.columns or 'category' not in self.memory_df.columns:
                logger.warning("[HistoricalMemory] Required columns ('text', 'category') missing.")
                return
            # Fit TF-IDF and Nearest Neighbors
            logger.info("[HistoricalMemory] Indexing %d historical tickets...", len(self.memory_df))
            X = self.vectorizer.fit_transform(self.memory_df['text'].fillna(''))
            self.nn_model.fit(X)
            self.is_ready = True
            logger.info("[HistoricalMemory] Indexing complete.")
        except Exception as e:
            # Deliberate broad catch: historical memory is an optional
            # enhancement and must never be fatal to the caller.
            logger.error("[HistoricalMemory] Failed to load memory: %s", e)

    def retrieve_similar(self, query_text: str, k: int = 3) -> List[Dict]:
        """
        Retrieve top K similar historical tickets.

        Args:
            query_text: Free-text ticket description to match against memory.
            k: Number of neighbours requested; clamped to the number of
                indexed tickets (sklearn raises ValueError if k > n_samples).

        Returns:
            List of dicts with 'text', 'category', and 'similarity' (cosine
            similarity rounded to 4 decimals), best match first. Empty list
            when the index is not ready or the effective k is <= 0.
        """
        if not self.is_ready:
            return []
        # BUGFIX: kneighbors() raises ValueError when asked for more
        # neighbours than there are indexed samples; clamp instead of crash.
        k = min(k, len(self.memory_df))
        if k <= 0:
            return []
        # Vectorize query
        X_query = self.vectorizer.transform([query_text])
        # Search
        distances, indices = self.nn_model.kneighbors(X_query, n_neighbors=k)
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # Cosine distance -> similarity. TF-IDF vectors are non-negative,
            # so distance lies in [0, 1] and similarity in [0, 1].
            similarity = 1.0 - dist
            row = self.memory_df.iloc[idx]
            results.append({
                'text': row['text'],
                'category': row['category'],
                'similarity': round(similarity, 4)
            })
        return results

    def compute_historical_boost(self, query_text: str, candidate_category: str, k: int = 5) -> float:
        """
        Calculate a confidence boost if the most similar past tickets were
        resolved in the same candidate category.

        Args:
            query_text: New ticket text.
            candidate_category: Category whose historical support is measured.
            k: Number of neighbours to consult.

        Returns:
            Similarity-weighted match ratio scaled into [0, 0.15], rounded to
            4 decimals. 0.0 when memory is unavailable or retrieval is empty.
        """
        if not self.is_ready:
            return 0.0
        similar_tickets = self.retrieve_similar(query_text, k=k)
        if not similar_tickets:
            return 0.0
        # Count how many of the top-k match the candidate category, weighted by similarity
        boost = 0.0
        total_weight = 0.0
        for t in similar_tickets:
            weight = t['similarity']
            total_weight += weight
            if t['category'] == candidate_category:
                boost += weight
        # Guard against all-zero similarities (e.g. query shares no vocabulary).
        if total_weight == 0:
            return 0.0
        match_ratio = boost / total_weight
        # Max boost is 0.15 (15%)
        return round(match_ratio * 0.15, 4)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    memory = HistoricalMemoryLayer()
    test_queries = [
        "My invoice from last month is incorrect, please fix the billing.",
        "The API keeps returning 500 errors since last Tuesday's update.",
        "How do I add another user to our account?"
    ]
    for q in test_queries:
        print(f"\nQuery: '{q}'")
        results = memory.retrieve_similar(q, k=2)
        for r in results:
            print(f" -> [{r['category']}] (sim: {r['similarity']:.2f}) {r['text']}")
        boost = memory.compute_historical_boost(q, "billing")
        print(f"Historical boost for 'billing': +{boost}")