# huzaifa-dangote's picture
# Upload folder using huggingface_hub
# 0876544 verified
import os
import glob
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
# Load settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Chat model identifier; not referenced by the functions visible in this file.
MODEL = "gpt-4.1-mini"

# The vector store and the knowledge base both live in the directory one
# level above the directory that contains this file.
_PROJECT_ROOT = Path(__file__).parent.parent
DB_NAME = str(_PROJECT_ROOT / "vector_db")
KNOWLEDGE_BASE_PATH = str(_PROJECT_ROOT / "knowledge-base")

# Embedding function handed to Chroma by create_embeddings().
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
def fetch_documents():
    """Load the knowledge base's Markdown files, tagged by their folder.

    Each immediate subdirectory of ``KNOWLEDGE_BASE_PATH`` is treated as one
    document type. Files matching ``*.md`` in that subdirectory are read as
    UTF-8 text, and every resulting document gets ``metadata["doc_type"]``
    set to the subdirectory's name.

    Returns:
        list: The loaded documents, grouped by folder in glob order.
    """
    documents = []
    for entry in glob.glob(str(Path(KNOWLEDGE_BASE_PATH) / "*")):
        # The "*" pattern also matches plain files sitting in the
        # knowledge-base root; DirectoryLoader rejects a non-directory path,
        # so skip those instead of aborting the whole ingestion run.
        if not os.path.isdir(entry):
            continue
        doc_type = os.path.basename(entry)
        loader = DirectoryLoader(
            entry,
            glob="*.md",
            loader_cls=TextLoader,
            loader_kwargs={"encoding": "utf-8"},
        )
        for doc in loader.load():
            doc.metadata["doc_type"] = doc_type
            documents.append(doc)
    return documents
def create_chunks(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split, e.g. the result of fetch_documents().
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 200, the value
            that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
def create_embeddings(chunks):
    """Rebuild the persistent Chroma vector store from ``chunks``.

    If ``DB_NAME`` already exists on disk, the collection stored there is
    deleted first so the new store contains only the given chunks. After the
    store is built, a one-line summary (vector count and embedding
    dimensionality) is printed.

    Args:
        chunks: Document chunks to embed, e.g. the result of create_chunks().

    Returns:
        The ``Chroma`` vector store created from ``chunks``.
    """
    if os.path.exists(DB_NAME):
        # Drop the previous collection so repeated runs do not accumulate
        # duplicate vectors.
        Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=DB_NAME,
    )

    collection = vectorstore._collection
    count = collection.count()
    if count:
        # One record is enough to measure the vector width; limit=1 avoids
        # pulling every stored embedding into memory just to read the first.
        sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
        dimensions = len(sample_embedding)
    else:
        # Nothing stored: there is no vector to index, so report zero
        # instead of failing on an empty result.
        dimensions = 0
    print(f"There are {count} vectors in the vector store with {dimensions} dimensions")
    return vectorstore
if __name__ == "__main__":
    # Run the full ingestion pipeline: load -> split -> embed and persist.
    create_embeddings(create_chunks(fetch_documents()))
    print("Ingestion complete")