Finsolve / embed.py
Shoaib-33's picture
Implemenation added
bdb1957
import os
import hashlib
import shutil
import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
# -------------------------------
# Configuration
# -------------------------------
BASE_DIR = "resources/data"
CHROMA_DIR = "chroma_db"
# Known departments
DEPARTMENTS = ["engineering", "finance", "general", "hr", "marketing"]
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Markdown/CSV-aware splitter β€” respects heading boundaries
md_splitter = RecursiveCharacterTextSplitter(
chunk_size=500,
chunk_overlap=50,
separators=["\n## ", "\n### ", "\n\n", "\n", " "]
)
# -------------------------------
# Deduplication
# -------------------------------
seen_hashes = set()
def get_hash(text: str) -> str:
return hashlib.md5(text.strip().encode()).hexdigest()
def deduplicate(docs: list) -> list:
unique = []
for doc in docs:
h = get_hash(doc.page_content)
if h not in seen_hashes:
seen_hashes.add(h)
unique.append(doc)
return unique
# -------------------------------
# Main ingestion loop
# -------------------------------
all_split_docs = []
for department in DEPARTMENTS:
dept_path = os.path.join(BASE_DIR, department)
if not os.path.isdir(dept_path):
print(f"⚠️ Folder not found, skipping: {dept_path}")
continue
print(f"\nπŸ” Processing: {department}")
dept_docs = []
for file in sorted(os.listdir(dept_path)):
file_path = os.path.join(dept_path, file)
file_ext = os.path.splitext(file)[-1].lower()
# -------------------------------
# Handle CSV files
# -------------------------------
if file_ext == ".csv":
print(f" πŸ“„ Loading CSV for embedding: {file}")
try:
df = pd.read_csv(file_path)
for _, row in df.iterrows():
# Convert each row into a text document
text = "\n".join([f"{col}: {row[col]}" for col in df.columns])
doc = Document(
page_content=text,
metadata={
"source": file,
"file_type": ".csv",
"role": department.lower(),
"category": department.lower()
}
)
dept_docs.append(doc)
print(f" βœ… Loaded {len(df)} rows from {file}")
except Exception as e:
print(f" ❌ Failed to load CSV {file}: {e}")
continue
# -------------------------------
# Handle Markdown files
# -------------------------------
if file_ext != ".md":
print(f" ⏭️ Skipping unsupported file type: {file}")
continue
try:
try:
loader = UnstructuredFileLoader(file_path)
docs = loader.load()
except Exception:
loader = TextLoader(file_path, encoding="utf-8")
docs = loader.load()
for doc in docs:
doc.metadata["source"] = file
doc.metadata["file_type"] = ".md"
doc.metadata["role"] = department.lower()
doc.metadata["category"] = department.lower()
dept_docs.extend(docs)
print(f" πŸ“„ Loaded: {file} ({len(docs)} doc(s))")
except Exception as e:
print(f" ❌ Failed to load {file}: {e}")
if not dept_docs:
print(f" ⚠️ No documents loaded for: {department}")
continue
# -------------------------------
# Split large documents
# -------------------------------
split_docs = md_splitter.split_documents(dept_docs)
# -------------------------------
# Deduplicate
# -------------------------------
split_docs = deduplicate(split_docs)
all_split_docs.extend(split_docs)
print(f" βœ… {len(split_docs)} unique chunks stored for: {department}")
# -------------------------------
# Build Chroma DB
# -------------------------------
if not all_split_docs:
print("\n❌ No documents to embed. Check your resources/data folders.")
exit(1)
print(f"\nβš™οΈ Building Chroma DB with {len(all_split_docs)} total chunks...")
shutil.rmtree(CHROMA_DIR, ignore_errors=True)
db = Chroma.from_documents(
documents=all_split_docs,
embedding=embedding_model,
persist_directory=CHROMA_DIR,
collection_name="company_docs"
)
# -------------------------------
# Validation Summary
# -------------------------------
stored = db._collection.get()
roles_found = sorted({m.get("role", "?") for m in stored["metadatas"]})
sources_found = sorted({m.get("source", "?") for m in stored["metadatas"]})
print(f"\nπŸŽ‰ Embedding complete!")
print(f" Total chunks : {len(stored['ids'])}")
print(f" Roles indexed : {roles_found}")
print(f" Files indexed : {sources_found}")
print(f"\nπŸ“‹ Sample metadata (first 3):")
for meta in stored["metadatas"][:3]:
print(f" {meta}")