Spaces:
Build error
Build error
| """ | |
| Knowledge utilities to avoid circular imports | |
| """ | |
| import json | |
| import logging | |
| import math | |
| from collections import defaultdict | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Dict, Any, List | |
logger = logging.getLogger(__name__)


class LocalKnowledgeTool:
    """Local fallback knowledge tool when the vector store is unavailable.

    Documents are persisted as JSON files under ``cache_dir`` and searched
    via an in-memory inverted index with TF-IDF-style scoring.
    """

    def __init__(self, cache_dir: str = "./knowledge_cache"):
        """Load cached documents from *cache_dir* and build the search index.

        Args:
            cache_dir: Directory holding one JSON file per document;
                created (with parents) if missing.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.local_docs: Dict[str, Dict[str, Any]] = {}
        self.inverted_index: Dict[str, set] = defaultdict(set)  # word -> doc_ids
        self._load_local_docs()
        self._build_index()

    def _load_local_docs(self):
        """Load documents from the local JSON cache into memory."""
        try:
            for file_path in self.cache_dir.glob("*.json"):
                # Explicit encoding so cached docs round-trip on any platform.
                with open(file_path, 'r', encoding='utf-8') as f:
                    doc_data = json.load(f)
                    self.local_docs[doc_data["id"]] = doc_data
            logger.info(f"Loaded {len(self.local_docs)} local documents")
        except Exception as e:
            # Best-effort: a corrupt cache must not prevent startup.
            logger.warning(f"Failed to load local docs: {e}")

    @staticmethod
    def _index_terms(text: str):
        """Yield normalized index terms: lowercased, punctuation-stripped, len > 2."""
        for word in set(text.lower().split()):
            word = word.strip('.,!?;:"')
            if len(word) > 2:  # skip very short words
                yield word

    def _build_index(self):
        """Build the inverted index (term -> doc ids) over all loaded docs."""
        for doc_id, doc_data in self.local_docs.items():
            for term in self._index_terms(doc_data.get("text", "")):
                self.inverted_index[term].add(doc_id)

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Search cached documents with inverted-index, TF-IDF-like scoring.

        Args:
            query: Free-text query; matched word-by-word after lowercasing
                and punctuation stripping.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts with ``id``, ``text`` (snippet), ``source``,
            ``similarity`` (score normalized into [0, 1]) and ``full_text``,
            best matches first. Empty list when nothing matches.
        """
        # Guard: with no documents, math.log(0) below would raise ValueError.
        if not self.local_docs:
            return []

        query_words = set(query.lower().split())
        doc_scores: Dict[str, float] = defaultdict(float)

        # Score documents based on word matches.
        for word in query_words:
            word = word.strip('.,!?;:"')
            matching_docs = self.inverted_index.get(word, set())
            if not matching_docs:
                continue
            # Smoothed IDF: the +1 in the numerator keeps the weight
            # non-negative even when a term appears in every document
            # (the unsmoothed log(N/(n+1)) penalized ubiquitous terms).
            idf = math.log((len(self.local_docs) + 1) / (len(matching_docs) + 1))
            for doc_id in matching_docs:
                # TF scoring: raw occurrence count in the document text.
                doc_text = self.local_docs[doc_id].get("text", "").lower()
                tf = doc_text.count(word)
                doc_scores[doc_id] += tf * idf

        # Sort by score, best first.
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

        # Build result payloads for the top_k documents.
        results = []
        for doc_id, score in sorted_docs[:top_k]:
            doc_data = self.local_docs[doc_id]
            # Extract a relevant snippet around the first query-term hit.
            snippet = self._extract_snippet(doc_data.get("text", ""), query)
            results.append({
                "id": doc_id,
                "text": snippet,
                "source": doc_data.get("source", "local"),
                "similarity": min(score / 10.0, 1.0),  # clamp into [0, 1]
                "full_text": doc_data.get("text", "")
            })
        return results

    def _extract_snippet(self, text: str, query: str, context_words: int = 50) -> str:
        """Extract a window of ~context_words around the first query-term hit.

        Falls back to the start of the text when no query word occurs.
        """
        words = text.split()
        query_words = query.lower().split()

        # Find the first word containing any query term (substring match).
        best_position = 0
        for i, word in enumerate(words):
            if any(qw in word.lower() for qw in query_words):
                best_position = i
                break

        # Extract context around the hit position.
        start = max(0, best_position - context_words // 2)
        end = min(len(words), best_position + context_words // 2)
        snippet = " ".join(words[start:end])

        # Add ellipsis on whichever side was truncated.
        if start > 0:
            snippet = "..." + snippet
        if end < len(words):
            snippet = snippet + "..."
        return snippet

    def add_document(self, text: str, source: str = "local") -> str:
        """Add a document to the in-memory store, index it, and persist it.

        Args:
            text: Document body.
            source: Provenance label stored alongside the text.

        Returns:
            The generated document id (``local_<n>``).
        """
        # Find a free id: bare len()+1 could collide with an id already
        # loaded from cache (e.g. after a file was deleted), silently
        # overwriting an existing document.
        n = len(self.local_docs) + 1
        while f"local_{n}" in self.local_docs:
            n += 1
        doc_id = f"local_{n}"

        doc_data = {
            "id": doc_id,
            "text": text,
            "source": source,
            "created_at": datetime.now().isoformat()
        }
        self.local_docs[doc_id] = doc_data

        # Keep the inverted index in sync with the new document.
        for term in self._index_terms(text):
            self.inverted_index[term].add(doc_id)

        # Persist to the cache directory so it survives restarts.
        file_path = self.cache_dir / f"{doc_id}.json"
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(doc_data, f, indent=2)

        return doc_id
def create_local_knowledge_tool() -> LocalKnowledgeTool:
    """Factory: build a LocalKnowledgeTool with default settings for fallback use."""
    tool = LocalKnowledgeTool()
    return tool