import os

from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.indexes import SQLRecordManager, index

from src.pdf_handler import extract_pdf, load_pdf_directory, split_pdf
from src.utils import load_config

# Fallback on-disk location shared by the Chroma store and the record manager.
_DEFAULT_CACHE_DIR = './.cache/database'


def setup_chroma(index_name, embedding_model, persist_directory=None):
    """Create a Chroma vector store persisted on disk.

    Args:
        index_name: Passed to ``Chroma`` as its first positional argument
            (the collection name in ``langchain_chroma``).
        embedding_model: Embedding object handed to Chroma as
            ``embedding_function``.
        persist_directory: Directory in which Chroma persists its data.
            Any falsy value (``None``, ``''``) selects the default cache
            directory.

    Returns:
        The configured ``Chroma`` instance.
    """
    if not persist_directory:
        persist_directory = _DEFAULT_CACHE_DIR
    # Make sure the directory exists before Chroma is pointed at it.
    os.makedirs(persist_directory, exist_ok=True)
    return Chroma(
        index_name,
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    )


def _remove_files_in(directory):
    """Delete the regular files directly inside *directory*.

    The directory itself and any subdirectories are left in place, so an
    unexpected nested folder cannot make the cleanup raise.
    """
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if os.path.isfile(path):
            os.remove(path)


class VectorDB:
    """Chroma-backed vector store paired with a SQL record manager.

    The record manager keeps track of what has already been indexed so that
    repeated calls to :meth:`index` can be reconciled by LangChain's
    indexing API.
    """

    def __init__(self, index_name=None, cache_dir=None):
        """Set up the embedding model, vector store and record manager.

        Args:
            index_name: Name of the Chroma collection; ``None`` selects
                ``'default'``.
            cache_dir: Directory holding both the Chroma data and the SQLite
                record-manager file. Any falsy value selects the default
                cache directory.
        """
        if index_name is None:
            index_name = 'default'
        embedding = OllamaEmbeddings(model='nomic-embed-text:latest', num_gpu=1)

        self.cache_dir = cache_dir if cache_dir else _DEFAULT_CACHE_DIR
        os.makedirs(self.cache_dir, exist_ok=True)

        self.vectorstore = setup_chroma(index_name, embedding, self.cache_dir)

        # The namespace keeps records of different collections apart inside
        # the shared SQLite file.
        namespace = f'chroma/{index_name}'
        self.record_manager = SQLRecordManager(
            namespace,
            db_url=f'sqlite:///{self.cache_dir}/record_manager_cache.sql',
        )
        self.record_manager.create_schema()

    def index(self, uploaded_file):
        """Extract, split and index the PDFs contained in *uploaded_file*.

        Args:
            uploaded_file: Upload handed to ``extract_pdf``, which returns
                the directory holding the extracted files.

        The extracted files are removed afterwards, even when loading,
        splitting or indexing raises, so a failed run does not leave stale
        files behind for the next one.
        """
        directory = extract_pdf(uploaded_file)
        try:
            docs = load_pdf_directory(directory)
            chunks = split_pdf(docs)
            # NOTE: per LangChain's indexing API, cleanup='full' also deletes
            # previously indexed documents that are not part of this run.
            index(
                docs_source=chunks,
                record_manager=self.record_manager,
                vector_store=self.vectorstore,
                cleanup='full',
                source_id_key='source',
            )
        finally:
            _remove_files_in(directory)

    def as_retriever(self):
        """Return the vector store's default retriever."""
        return self.vectorstore.as_retriever()