# ChatBot / Vector_storing.py
# krishnadhulipalla's picture
# Updated UI & personal data
# 102dac3
import os
import re
import json
import hashlib
from pathlib import Path
import sys
# Ensure we can import from backend if running from root
sys.path.append(str(Path(__file__).resolve().parent))
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from backend import config
# === UTILS ===
def hash_text(text: str) -> str:
    """Return a short, stable fingerprint of *text*.

    The text is encoded as UTF-8 and hashed with MD5; only the first eight
    hexadecimal characters of the digest are returned. This is an identifier,
    not a security primitive.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()[0:8]
# === MAIN FUNCTION ===
# Matches a run of leading '#' that is immediately followed by heading text.
# The second group excludes '#' on purpose: with a plain ``\S`` the regex engine
# backtracks inside the run of hashes and rewrites a well-formed "## Title"
# into "# # Title", destroying the heading level.
_HEADING_MISSING_SPACE = re.compile(r"^(#+)([^\s#])", flags=re.MULTILINE)


def _normalize_headings(content: str) -> str:
    """Insert the missing space after heading hashes, keeping the heading level."""
    return _HEADING_MISSING_SPACE.sub(r"\1 \2", content)


def _load_markdown_docs(md_dir: str) -> list:
    """Read every non-empty ``*.md`` file in *md_dir* into a content/metadata dict."""
    # Sort so chunk order (and therefore chunk ids and the index) is
    # reproducible; glob order otherwise depends on the filesystem.
    md_files = sorted(Path(md_dir).glob("*.md"))
    if not md_files:
        print(f"⚠️ No markdown files found in: {md_dir}")

    docs = []
    for md_file in md_files:
        try:
            content = md_file.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError) as e:
            print(f"❌ Failed to read {md_file}: {e}")
            continue
        if not content:
            continue

        content = _normalize_headings(content)
        docs.append(
            {
                "content": content,
                "metadata": {
                    "source": md_file.name,
                    # First line of the file doubles as the document header.
                    "header": content.split("\n", 1)[0],
                },
            }
        )
    return docs


def _build_embeddings():
    """Create the embedding backend selected by ``config.USE_OPENAI_EMBEDDING``."""
    if config.USE_OPENAI_EMBEDDING:
        print(f"🔹 Using OpenAI Embeddings: {config.EMBEDDING_MODEL_NAME}")
        return OpenAIEmbeddings(
            model=config.EMBEDDING_MODEL_NAME,
            openai_api_key=config.OPENAI_API_KEY,
        )
    print(f"🔹 Using HuggingFace Embeddings: {config.EMBEDDING_MODEL_NAME}")
    return HuggingFaceEmbeddings(
        model_name=config.EMBEDDING_MODEL_NAME,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )


def create_faiss_store(
    md_dir: str = str(config.PERSONAL_DATA_DIR),
    chunk_size: int = 1000,
    chunk_overlap: int = 250,
    persist_dir: str = str(config.FAISS_PATH.parent),  # Save to parent of specific version
    chunk_save_path: str = str(config.CHUNKS_PATH),
    min_chunk_chars: int = 50,
):
    """
    Build a FAISS index from the markdown files in a directory.

    Reads all ``*.md`` files in ``md_dir``, splits them into chunks, prefixes
    each chunk with its document header, saves the chunks as JSON, and builds
    a FAISS index using either OpenAI or HuggingFace embeddings, depending on
    ``config.USE_OPENAI_EMBEDDING``.

    Args:
        md_dir: Directory scanned (non-recursively) for ``*.md`` files.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.
        persist_dir: Directory under which a versioned index folder
            (``v<chunks>_<chunk_size>-<chunk_overlap>``) is created.
        chunk_save_path: JSON file the raw chunks are written to.
        min_chunk_chars: Chunks shorter than this (after stripping) are dropped.

    Returns:
        None. Output is written to disk; if no chunks are produced, the FAISS
        build is skipped.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n# ", "\n## ", "\n### ", "\n#### ", "\n\n", "\n- ", "\n", ". ", " "],
        keep_separator=True,
        length_function=len,  # consider tokenizer-based later
        is_separator_regex=False,
    )

    docs = _load_markdown_docs(md_dir)
    all_chunks, failed_chunks = [], []

    # Split into chunks and keep them (no LLM enrichment)
    for doc in docs:
        source = doc["metadata"]["source"]
        header = doc["metadata"]["header"]
        try:
            chunks = splitter.split_text(doc["content"])
        except Exception as e:  # best-effort: one bad file must not abort the build
            print(f"❌ Error splitting {source}: {e}")
            # Record the failure so the summary count and failure file are meaningful.
            failed_chunks.append(source)
            continue

        for i, chunk in enumerate(chunks):
            chunk = chunk.strip()
            if len(chunk) < min_chunk_chars:
                continue
            metadata = {
                **doc["metadata"],
                "chunk_id": f"{source}_#{i}_{hash_text(chunk)}",
                # Computed on the raw chunk, before the header prefix is added.
                "has_header": chunk.startswith("#"),
                "word_count": len(chunk.split()),
            }
            # Keep raw chunk (no summaries / questions), prefixed with the doc header
            all_chunks.append(
                {"text": f"[HEADER] {header}\n\n{chunk}", "metadata": metadata}
            )

    print(f"✅ Markdown files processed: {len(docs)}")
    print(f"✅ Chunks created: {len(all_chunks)} | ⚠️ Failed: {len(failed_chunks)}")

    # Ensure output dir exists and save raw chunks JSON. dirname is "" for a
    # bare file name, and os.makedirs("") raises, so only create when present.
    chunk_dir = os.path.dirname(chunk_save_path)
    if chunk_dir:
        os.makedirs(chunk_dir, exist_ok=True)
    with open(chunk_save_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)
    print(f"📁 Saved chunks → {chunk_save_path}")

    # Written before the early return so failures are reported even when
    # nothing could be indexed.
    if failed_chunks:
        with open(config.FAILED_CHUNKS_PATH, "w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in failed_chunks)
        print("📝 Failed chunk IDs saved to failed_chunks.txt")

    # If nothing to index, stop here
    if not all_chunks:
        print("⚠️ No chunks to index. Skipping FAISS build.")
        return

    # Prepare FAISS save path (makedirs also creates persist_dir itself)
    version_tag = f"v{len(all_chunks)}_{chunk_size}-{chunk_overlap}"
    save_path = os.path.join(persist_dir, version_tag)
    os.makedirs(save_path, exist_ok=True)

    # Embeddings + FAISS
    vector_store = FAISS.from_texts(
        texts=[c["text"] for c in all_chunks],
        embedding=_build_embeddings(),
        metadatas=[c["metadata"] for c in all_chunks],
    )
    vector_store.save_local(save_path)
    print(f"✅ FAISS index saved at: {save_path}")

    avg_len = sum(len(c["text"]) for c in all_chunks) / len(all_chunks)
    print(f"📊 Stats → Chunks: {len(all_chunks)} | Avg length: {avg_len:.1f} characters")
if __name__ == "__main__":
    # Script entry point: run the full build with the default arguments.
    create_faiss_store()