File size: 5,197 Bytes
8a682b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Knowledge utilities to avoid circular imports
"""

import json
import logging
import math
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class LocalKnowledgeTool:
    """Local fallback knowledge tool when vector store is unavailable.

    Documents are kept one-per-file as JSON inside ``cache_dir`` and are
    searched through an in-memory inverted index (word -> set of document
    ids) using a TF-IDF-like score.
    """

    # Punctuation trimmed from both ends of every token (index and query side).
    _PUNCTUATION = '.,!?;:"'
    # Tokens shorter than this are considered noise and are not indexed.
    _MIN_WORD_LENGTH = 3

    def __init__(self, cache_dir: str = "./knowledge_cache"):
        """Create the cache directory if needed, load cached docs and index them.

        Args:
            cache_dir: Directory holding one ``<doc_id>.json`` file per document.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.local_docs = {}
        self.inverted_index = defaultdict(set)  # word -> doc_ids
        self._load_local_docs()
        self._build_index()

    @classmethod
    def _tokenize(cls, text: str) -> set:
        """Return the distinct lowercase, punctuation-trimmed tokens of *text*."""
        trimmed = (raw.strip(cls._PUNCTUATION) for raw in text.lower().split())
        return {token for token in trimmed if token}

    def _index_document(self, doc_id, text: str) -> None:
        """Register every sufficiently long token of *text* under *doc_id*."""
        for word in self._tokenize(text):
            if len(word) >= self._MIN_WORD_LENGTH:
                self.inverted_index[word].add(doc_id)

    def _load_local_docs(self):
        """Load documents from local cache.

        Loading is best-effort: a file that cannot be read or parsed is
        logged and skipped so that it does not prevent the remaining
        documents from being loaded.
        """
        try:
            # Sorted so that load order (and "last one wins" on duplicate
            # ids) does not depend on the file system's listing order.
            paths = sorted(self.cache_dir.glob("*.json"))
        except OSError as e:
            logger.warning(f"Failed to load local docs: {e}")
            return

        for file_path in paths:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    doc_data = json.load(f)
                self.local_docs[doc_data["id"]] = doc_data
            except (OSError, ValueError, KeyError, TypeError) as e:
                # ValueError covers malformed JSON and undecodable bytes;
                # KeyError/TypeError cover payloads without a usable "id".
                logger.warning(f"Failed to load local docs: {file_path}: {e}")
        logger.info(f"Loaded {len(self.local_docs)} local documents")

    def _build_index(self):
        """Build inverted index for better search"""
        for doc_id, doc_data in self.local_docs.items():
            self._index_document(doc_id, doc_data.get("text", ""))

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Search the local documents with TF-IDF-like scoring.

        Args:
            query: Free-text query; split on whitespace, lowercased and
                trimmed of surrounding punctuation.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts ordered by descending score, each with the
            keys ``id``, ``text`` (a snippet), ``source``, ``similarity``
            (score / 10 capped at 1.0) and ``full_text``. Empty when nothing
            matches, when the store is empty, or when ``top_k`` <= 0.
        """
        total_docs = len(self.local_docs)
        if top_k <= 0 or total_docs == 0:
            return []

        doc_scores = defaultdict(float)
        for word in self._tokenize(query):
            matching_docs = self.inverted_index.get(word)
            if not matching_docs:
                continue

            # Smoothed IDF: rarer words still weigh more, but the value stays
            # strictly positive even when a word occurs in every document
            # (the unsmoothed log(N / (df + 1)) goes to zero or negative).
            idf = math.log((total_docs + 1) / (len(matching_docs) + 1)) + 1.0

            for doc_id in matching_docs:
                # TF: number of (substring) occurrences of the word.
                doc_text = self.local_docs[doc_id].get("text", "").lower()
                doc_scores[doc_id] += doc_text.count(word) * idf

        # Highest score first; ties broken by id so the order is deterministic
        # regardless of set iteration order.
        ranked = sorted(
            doc_scores.items(),
            key=lambda item: (-item[1], str(item[0])),
        )

        results = []
        for doc_id, score in ranked[:top_k]:
            doc_data = self.local_docs[doc_id]
            full_text = doc_data.get("text", "")
            results.append({
                "id": doc_id,
                "text": self._extract_snippet(full_text, query),
                "source": doc_data.get("source", "local"),
                "similarity": min(score / 10.0, 1.0),  # Normalize score
                "full_text": full_text
            })

        return results

    def _extract_snippet(self, text: str, query: str, context_words: int = 50) -> str:
        """Extract a snippet of about ``context_words`` words around the query.

        The snippet is centred on the first word of *text* that contains any
        query term (terms are trimmed of punctuation, like in ``search``).
        When no word matches, the snippet starts at the beginning of *text*.
        ``...`` is added on whichever side was truncated.
        """
        words = text.split()
        # Empty terms must be dropped: "" is a substring of every word.
        query_terms = self._tokenize(query)

        best_position = 0
        for position, word in enumerate(words):
            lowered = word.lower()
            if any(term in lowered for term in query_terms):
                best_position = position
                break

        # Extract context around position
        half_window = context_words // 2
        start = max(0, best_position - half_window)
        end = min(len(words), best_position + half_window)

        snippet = " ".join(words[start:end])

        # Add ellipsis if truncated
        if start > 0:
            snippet = "..." + snippet
        if end < len(words):
            snippet = snippet + "..."

        return snippet

    def _next_doc_id(self) -> str:
        """Return a ``local_<n>`` id used neither in memory nor on disk."""
        number = len(self.local_docs) + 1
        doc_id = f"local_{number}"
        # len() + 1 alone collides when cached ids are not contiguous, which
        # would silently overwrite an existing document.
        while doc_id in self.local_docs or (self.cache_dir / f"{doc_id}.json").exists():
            number += 1
            doc_id = f"local_{number}"
        return doc_id

    def add_document(self, text: str, source: str = "local") -> str:
        """Add document to local cache.

        Args:
            text: Document body to store and index.
            source: Label reported back in search results.

        Returns:
            The id assigned to the new document.

        Raises:
            OSError: If the document file cannot be written; in that case the
                in-memory store and index are left unchanged.
        """
        doc_id = self._next_doc_id()
        doc_data = {
            "id": doc_id,
            "text": text,
            "source": source,
            "created_at": datetime.now().isoformat()
        }

        # Persist first so a failed write cannot leave the in-memory state
        # ahead of what is on disk.
        file_path = self.cache_dir / f"{doc_id}.json"
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(doc_data, f, indent=2)

        self.local_docs[doc_id] = doc_data
        self._index_document(doc_id, text)

        return doc_id

def create_local_knowledge_tool() -> LocalKnowledgeTool:
    """Build and return a LocalKnowledgeTool using its default cache directory."""
    fallback_tool = LocalKnowledgeTool()
    return fallback_tool