| import os |
| import glob |
| import shutil |
| from pathlib import Path |
| from langchain_community.document_loaders import DirectoryLoader, TextLoader |
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
| from langchain_huggingface import HuggingFaceEmbeddings |
| from chromadb import PersistentClient |
| from dotenv import load_dotenv |
|
|
# Read settings from a .env file; override=True is passed so that values from
# the file win over variables already set in the process environment.
load_dotenv(override=True)

# Both storage locations sit two directory levels above this module.
_BASE_DIR = Path(__file__).parent.parent
DB_NAME = str(_BASE_DIR / "vector_db")
KNOWLEDGE_BASE = str(_BASE_DIR / "knowledge-base")
collection_name = "docs"

# Shared embedding model, created once at import time.
embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"trust_remote_code": True},
)
|
|
|
|
def fetch_documents():
    """Load every Markdown file under the knowledge base, tagged by folder.

    Each immediate sub-directory of ``KNOWLEDGE_BASE`` is treated as one
    document category: all ``*.md`` files found beneath it (recursively) are
    read as UTF-8 text, and the sub-directory's name is stored in each
    document's metadata under the ``"doc_type"`` key.

    Returns:
        list: The loaded documents, grouped by folder in sorted folder order.
    """
    # glob("*") also matches plain files sitting directly in the knowledge
    # base (e.g. a README); DirectoryLoader rejects non-directory paths, so
    # keep directories only. Sorting makes the load order independent of the
    # filesystem's listing order.
    folders = sorted(
        entry
        for entry in glob.glob(str(Path(KNOWLEDGE_BASE) / "*"))
        if os.path.isdir(entry)
    )

    documents = []
    for folder in folders:
        doc_type = os.path.basename(folder)
        loader = DirectoryLoader(
            folder,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={"encoding": "utf-8"},
        )
        for doc in loader.load():
            doc.metadata["doc_type"] = doc_type
            documents.append(doc)
    return documents
|
|
|
|
def create_chunks(documents, *, chunk_size=500, chunk_overlap=200):
    """Split documents into overlapping chunks ready for embedding.

    Args:
        documents: Documents to split (e.g. the list returned by
            ``fetch_documents``).
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 500,
            the value this module has always used.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            200, the value this module has always used.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        the given documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
|
|
|
|
def create_embeddings(chunks):
    """Embed the chunks and rebuild the persistent vector store from scratch.

    Any existing store at ``DB_NAME`` is deleted and replaced by a fresh
    collection named ``collection_name`` holding one record per chunk. Record
    ids are the chunk positions rendered as strings (``"0"``, ``"1"``, ...).
    A one-line summary of the stored vector count and dimensionality is
    printed at the end.

    Args:
        chunks: Non-empty sequence of objects exposing ``page_content`` (the
            text to embed) and ``metadata`` (stored alongside the vector).

    Raises:
        ValueError: If ``chunks`` is empty. Raised before the existing store
            is touched, so a bad run cannot wipe a good database.
    """
    if not chunks:
        raise ValueError(
            "No chunks to embed; the existing vector store was left untouched"
        )

    texts = [chunk.page_content for chunk in chunks]
    metas = [chunk.metadata for chunk in chunks]
    ids = [str(i) for i in range(len(texts))]

    # Embed BEFORE deleting the old store: if the model fails to load or the
    # embedding step raises, the previous database is still intact.
    vectors = embeddings.embed_documents(texts)

    if os.path.exists(DB_NAME):
        shutil.rmtree(DB_NAME)

    chroma = PersistentClient(path=DB_NAME)
    collection = chroma.get_or_create_collection(collection_name)

    # Chroma limits how many records one add() call may carry, so insert in
    # modest slices instead of a single call that can fail on large inputs.
    batch_size = 1000
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            embeddings=vectors[start:stop],
            documents=texts[start:stop],
            metadatas=metas[start:stop],
        )

    count = collection.count()
    dimensions = len(vectors[0])
    print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")
|
|
|
|
if __name__ == "__main__":
    # Full ingestion pipeline: load the knowledge base, split it into chunks,
    # then embed the chunks and persist them to the vector store.
    create_embeddings(create_chunks(fetch_documents()))
    print("Ingestion complete")