# Source: Multimodal_CV / src/vectorstore.py
# Last change: "Update src/vectorstore.py" by Al1Abdullah (commit 1fc7fa5, verified)
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.indexes import SQLRecordManager, index
from src.pdf_handler import extract_pdf, load_pdf_directory, split_pdf
from src.utils import load_config
import os
def setup_chroma(index_name, embedding_model, persist_directory=None):
    """Build a Chroma vector store that persists to a local directory.

    Args:
        index_name: Handed to ``Chroma`` as its first positional argument.
        embedding_model: Handed to ``Chroma`` as ``embedding_function``.
        persist_directory: Folder for the store's files. Any falsy value
            (``None``, ``''``) falls back to ``'./.cache/database'``.

    Returns:
        The newly constructed ``Chroma`` instance.
    """
    target_dir = persist_directory or './.cache/database'
    # Create the folder up front so Chroma is given an existing path.
    os.makedirs(target_dir, exist_ok=True)
    return Chroma(
        index_name,
        embedding_function=embedding_model,
        persist_directory=target_dir,
    )
class VectorDB:
    """Chroma vector store paired with a SQL record manager for indexing.

    Attributes set by ``__init__``:
        cache_dir: Directory holding the Chroma files and the record
            manager's SQLite database.
        vectorstore: Store returned by ``setup_chroma``.
        record_manager: ``SQLRecordManager`` scoped to the namespace
            ``chroma/<index_name>``.
    """

    def __init__(self, index_name=None, cache_dir=None):
        """Create the vector store and the record manager.

        Args:
            index_name: Name passed to ``setup_chroma`` and used in the
                record-manager namespace. ``None`` selects ``'default'``.
            cache_dir: Directory for all persisted state. Any falsy value
                selects ``'./.cache/database'``.
        """
        if index_name is None:
            index_name = 'default'
        embedding = OllamaEmbeddings(model='nomic-embed-text:latest', num_gpu=1)
        if not cache_dir:
            cache_dir = './.cache/database'
        self.cache_dir = cache_dir
        # The SQLite file below lives in this directory, so it must exist
        # before the record manager is created.
        os.makedirs(self.cache_dir, exist_ok=True)
        self.vectorstore = setup_chroma(index_name, embedding, self.cache_dir)
        namespace = f'chroma/{index_name}'
        self.record_manager = SQLRecordManager(
            namespace,
            db_url=f'sqlite:///{self.cache_dir}/record_manager_cache.sql',
        )
        self.record_manager.create_schema()

    def index(self, uploaded_file):
        """Extract, split and index an uploaded PDF, then delete the extracted files.

        Args:
            uploaded_file: Passed unchanged to ``extract_pdf``, which returns
                the directory containing the extracted files.

        The files in that directory are removed whether or not indexing
        succeeds, so a failed upload does not leave stale PDFs behind.
        Any exception raised while loading, splitting or indexing still
        propagates to the caller.
        """
        directory = extract_pdf(uploaded_file)
        try:
            docs = load_pdf_directory(directory)
            chunks = split_pdf(docs)
            # Inside a method body the bare name ``index`` resolves to the
            # module-level function imported from ``langchain.indexes``,
            # not to this method.
            # NOTE: per LangChain's indexing docs, cleanup='full' also deletes
            # previously indexed documents that are absent from this batch.
            index(
                docs_source=chunks,
                record_manager=self.record_manager,
                vector_store=self.vectorstore,
                cleanup='full',
                source_id_key='source',
            )
        finally:
            # Runs on success and on failure alike.
            for entry in os.listdir(directory):
                os.remove(os.path.join(directory, entry))

    def as_retriever(self):
        """Return ``self.vectorstore.as_retriever()`` with default arguments."""
        return self.vectorstore.as_retriever()