File size: 4,669 Bytes
2612bdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import logging
import os
from typing import Dict, List, Optional

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

logger = logging.getLogger(__name__)

class HistoricalMemoryLayer:
    """
    Historical Memory Layer using Retrieval-Augmented Generation (RAG) concepts.

    Stores successfully resolved past tickets, indexed with TF-IDF.
    When a new ambiguous ticket arrives, it retrieves the K nearest historical
    tickets (cosine similarity). This can be used to dynamically boost
    confidence or suggest resolutions.
    """

    # Maximum confidence boost contributed by historical agreement (15%).
    MAX_BOOST = 0.15

    def __init__(self, data_path: Optional[str] = None):
        """
        Args:
            data_path: CSV of resolved tickets with 'text' and 'category'
                columns. Defaults to <project root>/data/processed/train.csv,
                resolved relative to this file's location.
        """
        if data_path is None:
            base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            data_path = os.path.join(base, 'data', 'processed', 'train.csv')

        self.data_path = data_path
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        self.nn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
        self.memory_df = None
        # Flipped to True only after a successful index build; all public
        # methods degrade gracefully (empty result / zero boost) while False.
        self.is_ready = False

        self._load_memory()

    def _load_memory(self):
        """Load the historical CSV and build the TF-IDF + kNN index.

        Leaves `is_ready` False (logging a warning/error) on any failure so
        callers degrade gracefully instead of crashing at import/startup time.
        """
        try:
            if not os.path.exists(self.data_path):
                logger.warning(f"[HistoricalMemory] Data file not found at {self.data_path}")
                return

            self.memory_df = pd.read_csv(self.data_path)

            # Ensure required columns exist
            if 'text' not in self.memory_df.columns or 'category' not in self.memory_df.columns:
                logger.warning("[HistoricalMemory] Required columns ('text', 'category') missing.")
                return

            # BUG FIX: an empty CSV previously crashed NearestNeighbors.fit
            # with an opaque error; bail out early instead.
            if self.memory_df.empty:
                logger.warning("[HistoricalMemory] Data file contains no rows; memory disabled.")
                return

            # Fit TF-IDF and Nearest Neighbors
            logger.info(f"[HistoricalMemory] Indexing {len(self.memory_df)} historical tickets...")
            X = self.vectorizer.fit_transform(self.memory_df['text'].fillna(''))
            self.nn_model.fit(X)

            self.is_ready = True
            logger.info("[HistoricalMemory] Indexing complete.")

        except Exception as e:
            logger.error(f"[HistoricalMemory] Failed to load memory: {e}")

    def retrieve_similar(self, query_text: str, k: int = 3) -> List[Dict]:
        """
        Retrieve the top K most similar historical tickets.

        Args:
            query_text: Raw ticket text to match against memory.
            k: Number of neighbors requested. Clamped to the corpus size
                (sklearn's kneighbors raises when k > n_samples).

        Returns:
            Dicts with 'text', 'category', and 'similarity' (1 - cosine
            distance, rounded to 4 decimals), most similar first. Empty list
            if the index is not ready or k resolves to 0.
        """
        if not self.is_ready:
            return []

        # BUG FIX: kneighbors raises ValueError if k exceeds the number of
        # indexed tickets; clamp rather than crash on small corpora.
        k = min(k, len(self.memory_df))
        if k <= 0:
            return []

        # Vectorize query
        X_query = self.vectorizer.transform([query_text])

        # Search
        distances, indices = self.nn_model.kneighbors(X_query, n_neighbors=k)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # Cosine distance to similarity score
            similarity = 1.0 - dist
            row = self.memory_df.iloc[idx]

            results.append({
                'text': row['text'],
                'category': row['category'],
                'similarity': round(similarity, 4)
            })

        return results

    def compute_historical_boost(self, query_text: str, candidate_category: str, k: int = 5) -> float:
        """
        Calculate a confidence boost if the most similar past tickets
        were resolved in the same candidate category.

        The boost is the similarity-weighted fraction of the top-k neighbors
        matching `candidate_category`, scaled by MAX_BOOST.

        Returns:
            A float in [0, MAX_BOOST], rounded to 4 decimals; 0.0 when the
            index is not ready or nothing is retrieved.
        """
        if not self.is_ready:
            return 0.0

        similar_tickets = self.retrieve_similar(query_text, k=k)
        if not similar_tickets:
            return 0.0

        # Count how many of the top-k match the candidate category, weighted
        # by similarity so near-duplicates count more than loose matches.
        boost = 0.0
        total_weight = 0.0

        for t in similar_tickets:
            weight = t['similarity']
            total_weight += weight
            if t['category'] == candidate_category:
                boost += weight

        if total_weight == 0:
            return 0.0

        match_ratio = boost / total_weight

        return round(match_ratio * self.MAX_BOOST, 4)

if __name__ == "__main__":
    # Smoke-test the memory layer against a few representative tickets.
    logging.basicConfig(level=logging.INFO)
    memory = HistoricalMemoryLayer()

    sample_tickets = (
        "My invoice from last month is incorrect, please fix the billing.",
        "The API keeps returning 500 errors since last Tuesday's update.",
        "How do I add another user to our account?",
    )

    for ticket in sample_tickets:
        print(f"\nQuery: '{ticket}'")
        for match in memory.retrieve_similar(ticket, k=2):
            print(f" -> [{match['category']}] (sim: {match['similarity']:.2f}) {match['text']}")

        boost = memory.compute_historical_boost(ticket, "billing")
        print(f"Historical boost for 'billing': +{boost}")