"""
Knowledge utilities to avoid circular imports
"""
import json
import logging
import math
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List
logger = logging.getLogger(__name__)
class LocalKnowledgeTool:
    """Local fallback knowledge tool used when the vector store is unavailable.

    Documents are persisted as one JSON file per document under ``cache_dir``
    and searched via an inverted index with TF-IDF-style scoring.
    """

    def __init__(self, cache_dir: str = "./knowledge_cache"):
        """Create the tool and load any previously cached documents.

        Args:
            cache_dir: Directory holding one ``<doc_id>.json`` file per
                cached document; created if it does not exist.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.local_docs: Dict[str, Dict[str, Any]] = {}
        self.inverted_index: Dict[str, set] = defaultdict(set)  # word -> doc_ids
        self._logger = logging.getLogger(__name__)
        self._load_local_docs()
        self._build_index()

    def _load_local_docs(self) -> None:
        """Load documents from the local cache directory.

        Errors are handled per file so one corrupt/unreadable JSON file
        does not abort loading of the remaining documents.
        """
        for file_path in self.cache_dir.glob("*.json"):
            try:
                with open(file_path, 'r', encoding="utf-8") as f:
                    doc_data = json.load(f)
                self.local_docs[doc_data["id"]] = doc_data
            except (OSError, json.JSONDecodeError, KeyError) as e:
                self._logger.warning("Failed to load local doc %s: %s", file_path, e)
        self._logger.info("Loaded %d local documents", len(self.local_docs))

    @staticmethod
    def _index_words(text: str):
        """Yield normalized, index-worthy words from *text*.

        Lowercases, de-duplicates, strips edge punctuation, and skips
        words of length <= 2 (cheap stop-word filter).
        """
        for word in set(text.lower().split()):
            word = word.strip('.,!?;:"')
            if len(word) > 2:
                yield word

    def _build_index(self) -> None:
        """Build the inverted index (word -> set of doc ids) from loaded docs."""
        for doc_id, doc_data in self.local_docs.items():
            for word in self._index_words(doc_data.get("text", "")):
                self.inverted_index[word].add(doc_id)

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Search cached documents using the inverted index and TF-IDF scoring.

        Args:
            query: Free-text query string.
            top_k: Maximum number of results to return.

        Returns:
            Result dicts (``id``, ``text`` snippet, ``source``,
            ``similarity`` in [0, 1], ``full_text``) in descending score
            order; empty list when nothing matches or the cache is empty.
        """
        doc_scores: Dict[str, float] = defaultdict(float)
        num_docs = len(self.local_docs)
        for word in {w.strip('.,!?;:"') for w in query.lower().split()}:
            matching_docs = self.inverted_index.get(word, set())
            # Smoothed IDF: rarer words weigh more. The +1 terms keep the
            # log argument strictly positive — the unsmoothed form raised
            # ValueError (math.log(0)) when the corpus was empty.
            idf = math.log((num_docs + 1) / (len(matching_docs) + 1)) + 1.0
            for doc_id in matching_docs:
                # TF: occurrence count of the word in the document text.
                tf = self.local_docs[doc_id].get("text", "").lower().count(word)
                doc_scores[doc_id] += tf * idf
        ranked = sorted(doc_scores.items(), key=lambda kv: kv[1], reverse=True)
        results: List[Dict[str, Any]] = []
        for doc_id, score in ranked[:top_k]:
            doc_data = self.local_docs[doc_id]
            full_text = doc_data.get("text", "")
            results.append({
                "id": doc_id,
                "text": self._extract_snippet(full_text, query),
                "source": doc_data.get("source", "local"),
                "similarity": min(score / 10.0, 1.0),  # crude score normalization
                "full_text": full_text,
            })
        return results

    def _extract_snippet(self, text: str, query: str, context_words: int = 50) -> str:
        """Return ~context_words words of *text* around the first query hit.

        Falls back to the start of the text when no query word occurs;
        ellipses mark truncation at either end.
        """
        words = text.split()
        query_words = query.lower().split()
        best_position = 0
        for i, word in enumerate(words):
            if any(qw in word.lower() for qw in query_words):
                best_position = i
                break
        start = max(0, best_position - context_words // 2)
        end = min(len(words), best_position + context_words // 2)
        snippet = " ".join(words[start:end])
        if start > 0:
            snippet = "..." + snippet
        if end < len(words):
            snippet = snippet + "..."
        return snippet

    def add_document(self, text: str, source: str = "local") -> str:
        """Add a document to the in-memory store, index it, and persist it.

        Args:
            text: Document body.
            source: Provenance label stored with the document.

        Returns:
            The generated document id (``local_<n>``).
        """
        n = len(self.local_docs) + 1
        # Skip ids already taken (possible after a partial cache reload)
        # so an existing document is never silently overwritten.
        while f"local_{n}" in self.local_docs:
            n += 1
        doc_id = f"local_{n}"
        doc_data = {
            "id": doc_id,
            "text": text,
            "source": source,
            "created_at": datetime.now().isoformat(),
        }
        self.local_docs[doc_id] = doc_data
        # Keep the inverted index in sync with the new document.
        for word in self._index_words(text):
            self.inverted_index[word].add(doc_id)
        # Persist as one JSON file per document.
        file_path = self.cache_dir / f"{doc_id}.json"
        with open(file_path, 'w', encoding="utf-8") as f:
            json.dump(doc_data, f, indent=2)
        return doc_id
def create_local_knowledge_tool() -> LocalKnowledgeTool:
    """Factory for the local fallback knowledge tool (default cache dir)."""
    tool = LocalKnowledgeTool()
    return tool