"""Ingest department documents (Markdown and CSV) into a Chroma vector store.

Running this module walks ``BASE_DIR/<department>`` for every entry in
``DEPARTMENTS``, turns each Markdown file and each CSV row into a LangChain
``Document`` tagged with the department as its ``role``/``category``, splits
the documents into chunks, drops chunks whose text has already been seen, and
rebuilds the ``company_docs`` Chroma collection in ``CHROMA_DIR`` from scratch.
"""

import os
import hashlib
import shutil

import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

# -------------------------------
# Configuration
# -------------------------------
BASE_DIR = "resources/data"
CHROMA_DIR = "chroma_db"

# Known departments
DEPARTMENTS = ["engineering", "finance", "general", "hr", "marketing"]

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Markdown/CSV-aware splitter — respects heading boundaries
md_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n## ", "\n### ", "\n\n", "\n", " "],
)

# -------------------------------
# Deduplication
# -------------------------------
# Hashes of every chunk accepted so far; shared by all deduplicate() calls.
seen_hashes = set()


def get_hash(text: str) -> str:
    """Return the hex MD5 digest of *text* after stripping outer whitespace."""
    return hashlib.md5(text.strip().encode()).hexdigest()


def deduplicate(docs: list) -> list:
    """Return the documents in *docs* whose content has not been seen before.

    Content hashes are recorded in the module-level ``seen_hashes`` set, so a
    duplicate is dropped whether it appeared earlier in this call or in a
    previous call. Only ``page_content`` is hashed; metadata is ignored.

    NOTE(review): because the hash ignores the ``role`` metadata, a chunk that
    is identical in two departments is kept only for the department processed
    first — confirm this is intended if retrieval is filtered by role.
    """
    unique = []
    for doc in docs:
        h = get_hash(doc.page_content)
        if h not in seen_hashes:
            seen_hashes.add(h)
            unique.append(doc)
    return unique


# -------------------------------
# Loading helpers
# -------------------------------
def _department_metadata(file: str, file_type: str, department: str) -> dict:
    """Build the metadata attached to every document loaded from *file*."""
    role = department.lower()
    return {
        "source": file,
        "file_type": file_type,
        "role": role,
        "category": role,
    }


def _load_csv_documents(file_path: str, file: str, department: str) -> list:
    """Return one ``Document`` per CSV row, rendered as ``column: value`` lines.

    Raises whatever ``pandas.read_csv`` raises for an unreadable file; the
    caller decides how to report it.
    """
    df = pd.read_csv(file_path)
    columns = df.columns
    return [
        Document(
            page_content="\n".join(f"{col}: {row[col]}" for col in columns),
            metadata=_department_metadata(file, ".csv", department),
        )
        for _, row in df.iterrows()
    ]


def _load_markdown_documents(file_path: str, file: str, department: str) -> list:
    """Load a Markdown file and tag the resulting documents with department metadata.

    ``UnstructuredFileLoader`` is tried first; on any failure the file is read
    as plain UTF-8 text instead. An error from the fallback loader propagates.
    """
    try:
        docs = UnstructuredFileLoader(file_path).load()
    except Exception as e:
        # Deliberate best-effort fallback, but say why it happened instead of
        # swallowing the error silently.
        print(f" ⚠️ Unstructured loader failed for {file} ({e}); reading as plain text")
        docs = TextLoader(file_path, encoding="utf-8").load()

    for doc in docs:
        # Overrides the loader-provided "source" with the bare file name.
        doc.metadata.update(_department_metadata(file, ".md", department))
    return docs


# -------------------------------
# Main ingestion loop
# -------------------------------
all_split_docs = []

for department in DEPARTMENTS:
    dept_path = os.path.join(BASE_DIR, department)

    if not os.path.isdir(dept_path):
        print(f"⚠️ Folder not found, skipping: {dept_path}")
        continue

    print(f"\n🔍 Processing: {department}")
    dept_docs = []

    for file in sorted(os.listdir(dept_path)):
        file_path = os.path.join(dept_path, file)
        file_ext = os.path.splitext(file)[-1].lower()

        # -------------------------------
        # Handle CSV files
        # -------------------------------
        if file_ext == ".csv":
            print(f" 📄 Loading CSV for embedding: {file}")
            try:
                row_docs = _load_csv_documents(file_path, file, department)
            except Exception as e:
                print(f" ❌ Failed to load CSV {file}: {e}")
            else:
                dept_docs.extend(row_docs)
                print(f" ✅ Loaded {len(row_docs)} rows from {file}")
            continue

        # -------------------------------
        # Handle Markdown files
        # -------------------------------
        if file_ext != ".md":
            print(f" ⏭️ Skipping unsupported file type: {file}")
            continue

        try:
            docs = _load_markdown_documents(file_path, file, department)
        except Exception as e:
            print(f" ❌ Failed to load {file}: {e}")
        else:
            dept_docs.extend(docs)
            print(f" 📄 Loaded: {file} ({len(docs)} doc(s))")

    if not dept_docs:
        print(f" ⚠️ No documents loaded for: {department}")
        continue

    # -------------------------------
    # Split large documents
    # -------------------------------
    split_docs = md_splitter.split_documents(dept_docs)

    # -------------------------------
    # Deduplicate
    # -------------------------------
    split_docs = deduplicate(split_docs)

    all_split_docs.extend(split_docs)
    print(f" ✅ {len(split_docs)} unique chunks stored for: {department}")

# -------------------------------
# Build Chroma DB
# -------------------------------
if not all_split_docs:
    print("\n❌ No documents to embed. Check your resources/data folders.")
    # SystemExit rather than exit(): the latter is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

print(f"\n⚙️ Building Chroma DB with {len(all_split_docs)} total chunks...")

# Start from an empty directory so chunks from a previous run do not linger.
shutil.rmtree(CHROMA_DIR, ignore_errors=True)

db = Chroma.from_documents(
    documents=all_split_docs,
    embedding=embedding_model,
    persist_directory=CHROMA_DIR,
    collection_name="company_docs",
)

# -------------------------------
# Validation Summary
# -------------------------------
# Public accessor instead of the private `_collection`; only metadata is
# requested because the summary never reads the chunk text (ids are returned
# regardless of `include`).
stored = db.get(include=["metadatas"])
roles_found = sorted({m.get("role", "?") for m in stored["metadatas"]})
sources_found = sorted({m.get("source", "?") for m in stored["metadatas"]})

print("\n🎉 Embedding complete!")
print(f" Total chunks : {len(stored['ids'])}")
print(f" Roles indexed : {roles_found}")
print(f" Files indexed : {sources_found}")
print("\n📋 Sample metadata (first 3):")
for meta in stored["metadatas"][:3]:
    print(f" {meta}")