import os
import glob
from typing import List, Dict, Any
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Configuration: where source PDFs live, where the Chroma index persists,
# and which sentence-transformers model produces the embeddings.
DATA_DIR = "./data"  # directory scanned (non-recursively) for *.pdf files
CHROMA_DIR = "./chroma_db"  # on-disk ChromaDB persistence location
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # HuggingFace embedding model name

class RAGSystem:
    """Retrieval wrapper around a persistent Chroma vector store.

    On construction this either reopens an existing index found in
    CHROMA_DIR or builds a fresh one from the PDF files in DATA_DIR.
    """

    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        self.vectorstore = None
        self._initialize_db()

    def _initialize_db(self):
        """Initializes or loads the ChromaDB with data from DATA_DIR."""
        # A present, non-empty persist directory means a usable index exists.
        if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
            print("Loading existing vector database...")
            self.vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=self.embeddings)
            return

        print("Initializing new vector database from PDFs...")
        documents = []
        for pdf in glob.glob(os.path.join(DATA_DIR, "*.pdf")):
            # Best-effort: one unreadable PDF must not abort the whole build.
            try:
                documents.extend(PyPDFLoader(pdf).load())
            except Exception as e:
                print(f"Error loading {pdf}: {e}")

        if not documents:
            # Nothing to index — open an empty persistent store instead.
            self.vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=self.embeddings)
            return

        chunks = self.text_splitter.split_documents(documents)
        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=CHROMA_DIR,
        )
        print(f"Indexed {len(chunks)} chunks.")

    def query(self, text: str, k: int = 5) -> str:
        """Queries the vector database and returns a combined context string."""
        if not self.vectorstore:
            return ""

        matches = self.vectorstore.similarity_search(text, k=k)
        return "\n\n".join(doc.page_content for doc in matches)

# Singleton instance shared by importers of this module.
# NOTE(review): constructing RAGSystem at import time loads the embedding
# model and may build/scan the index — a heavyweight side effect on import;
# consider lazy initialization if startup cost becomes an issue.
rag_system = RAGSystem()