File size: 2,518 Bytes
db7ab00
 
 
2a60ded
db7ab00
c816ae2
 
db7ab00
 
b97a17b
c816ae2
db7ab00
 
b97a17b
 
 
 
 
 
 
 
 
db7ab00
c816ae2
b97a17b
 
 
 
 
 
 
 
 
db7ab00
 
c816ae2
 
db7ab00
c816ae2
 
db7ab00
c816ae2
db7ab00
 
2a60ded
c816ae2
 
2a60ded
db7ab00
c816ae2
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import torch
from sentence_transformers import SentenceTransformer

class AdvancedRAG:
    """Retrieval helper combining vector search with optional BM25 keyword search.

    Vector search runs against the collection object handed to the
    constructor (it must expose ``query`` and ``add`` with the keyword
    arguments used below).  BM25 is optional: it is only active after a
    successful ``initialize_bm25`` call.  The two rankings are merged with
    reciprocal rank fusion.
    """

    # Smoothing constant of reciprocal rank fusion: score = sum(1 / (K + rank)).
    # 60 is the value conventionally used for RRF.
    _RRF_K = 60

    def __init__(self, vector_db_collection, embedder_model='all-MiniLM-L6-v2'):
        """Store the collection and load the sentence-embedding model.

        Args:
            vector_db_collection: Collection object used for ``query``/``add``.
            embedder_model: Model name passed to ``SentenceTransformer``.
        """
        self.collection = vector_db_collection
        # Use the GPU when torch reports one, otherwise fall back to CPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.embedder = SentenceTransformer(embedder_model, device=device)
        # Stays None until initialize_bm25() succeeds.
        self.bm25_retriever = None

    def _get_langchain_doc(self):
        """Return LangChain's ``Document`` class, or a minimal stand-in.

        The stand-in is used only when ``langchain_core`` cannot be imported;
        it carries ``page_content`` and ``metadata`` attributes.
        """
        try:
            from langchain_core.documents import Document
        except ImportError:
            class Document:
                def __init__(self, page_content, metadata=None):
                    self.page_content = page_content
                    self.metadata = {} if metadata is None else metadata
        return Document

    def initialize_bm25(self, texts):
        """Build the BM25 retriever from a corpus of texts.

        Does nothing when ``texts`` is empty.  Any failure (for example a
        missing optional dependency) is reported on stdout and leaves
        ``self.bm25_retriever`` unchanged, so vector search keeps working.

        Args:
            texts: Iterable of strings forming the keyword-search corpus.
        """
        if not texts:
            return
        try:
            from langchain_community.retrievers import BM25Retriever
            doc_class = self._get_langchain_doc()
            documents = [doc_class(page_content=t) for t in texts]
            retriever = BM25Retriever.from_documents(documents)
            retriever.k = 5
            # Assign only once fully configured, so a failure above never
            # leaves a half-initialised retriever behind.
            self.bm25_retriever = retriever
        except Exception as e:
            # BM25 is an optional enhancement; degrade to vector-only search.
            print(f"BM25 Initialization Failed (Non-Critical): {e}")

    def hybrid_search(self, query, top_k=5):
        """Return up to ``top_k`` documents for ``query``.

        Runs a vector query and, if BM25 is initialised, a keyword query,
        then merges both rankings with reciprocal rank fusion.  When BM25 is
        unavailable or fails, the result is the de-duplicated vector ranking
        in its original order.

        Args:
            query: Query string.
            top_k: Maximum number of documents to return.

        Returns:
            List of document texts, best match first.
        """
        # 1. Vector search.
        query_embedding = self.embedder.encode(query).tolist()
        vector_results = self.collection.query(
            query_embeddings=[query_embedding], n_results=top_k
        )
        documents = vector_results['documents']
        vector_docs = list(documents[0] or []) if documents else []

        # 2. BM25 search (best effort, only if initialised).
        bm25_docs = []
        retriever = self.bm25_retriever
        if retriever is not None:
            try:
                # Make sure BM25 can contribute as many candidates as asked for.
                if getattr(retriever, 'k', top_k) < top_k:
                    retriever.k = top_k
                # `invoke` is the current retriever entry point; older
                # LangChain versions only offer `get_relevant_documents`.
                search = getattr(retriever, 'invoke', None)
                if search is None:
                    search = retriever.get_relevant_documents
                bm25_docs = [doc.page_content for doc in search(query)]
            except Exception as e:
                print(f"BM25 Search Failed (Non-Critical): {e}")

        # 3. Reciprocal rank fusion.  Plain concatenation would let the vector
        # hits fill every slot and silently drop all BM25 hits.
        scores = {}
        for ranking in (vector_docs, bm25_docs):
            # dict.fromkeys de-duplicates while keeping first-seen order.
            for rank, doc in enumerate(dict.fromkeys(ranking), start=1):
                scores[doc] = scores.get(doc, 0.0) + 1.0 / (self._RRF_K + rank)
        # sorted() is stable (also with reverse=True), so ties keep insertion
        # order: vector hits before BM25-only hits.
        fused = sorted(scores, key=scores.get, reverse=True)
        return fused[:top_k]

    def add_intelligence(self, new_text):
        """Embed ``new_text`` and add it to the collection.

        The id is the SHA-256 hex digest of the text.  The built-in ``hash``
        is salted per process for strings, so it cannot provide ids that are
        stable across runs.

        Args:
            new_text: Text to store.
        """
        import hashlib

        embedding = self.embedder.encode(new_text).tolist()
        doc_id = hashlib.sha256(new_text.encode('utf-8')).hexdigest()
        self.collection.add(
            documents=[new_text],
            embeddings=[embedding],
            ids=[doc_id],
        )