Policy-Navigator / index_builder.py
Mishal23's picture
Create index_builder.py
65afe01 verified
# index_builder.py
import json
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
# Path of the JSON export to index. The loader below expects a JSON array of
# objects and reads each object's "text" field.
file_path = "pdf_data.json"

# Splits every record into overlapping chunks (512 characters, 50 overlap).
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)


def _classify_text(text: str):
    """Return the ``(section, law_type)`` metadata pair for one record's text.

    The text is tagged ``("PPC", "criminal")`` when it contains the word
    "punishment" or "section" (case-insensitive); otherwise it is tagged
    ``("other", "general")``.
    """
    lowered = text.lower()  # lower-case once instead of once per keyword
    if "punishment" in lowered or "section" in lowered:
        return "PPC", "criminal"
    return "other", "general"


def _load_documents(path: str, text_splitter) -> list:
    """Read the JSON file at *path* and return its text as chunked Documents.

    Args:
        path: Location of the JSON file to read (decoded as UTF-8).
        text_splitter: Object whose ``split_text(str)`` yields the chunks.

    Returns:
        A list of ``Document`` objects, one per chunk, each carrying
        ``section`` and ``law_type`` metadata. The list is empty when the
        file cannot be read or parsed, or when it is not a JSON array; in
        those cases a failure message is printed instead of raising.
    """
    # Keep the try body to the operations that can legitimately fail on bad
    # input, so a programming error elsewhere is not silently swallowed.
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Failed to load: {e}")
        return []

    if not isinstance(data, list):
        print(
            "❌ Failed to load: expected a JSON array of records, "
            f"got {type(data).__name__}"
        )
        return []

    loaded = []
    for item in data:
        # Skip malformed records individually; previously a single bad record
        # raised inside the blanket try/except and dropped every record after it.
        text = item.get("text") if isinstance(item, dict) else None
        if not isinstance(text, str):
            continue
        section, law_type = _classify_text(text)
        for chunk in text_splitter.split_text(text):
            loaded.append(Document(
                page_content=chunk,
                metadata={"section": section, "law_type": law_type},
            ))
    return loaded


documents = _load_documents(file_path, splitter)
print(f"✅ Loaded {len(documents)} chunks with metadata")
# Stop early when nothing was loaded: FAISS sizes its index from the first
# embedding vector, so building from an empty list fails with an opaque
# IndexError, and only after the embedding model has been loaded for nothing.
if not documents:
    raise SystemExit("❌ No documents to index; FAISS index was not built.")

# Sentence-transformers model used to embed every chunk.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed all chunks and build the in-memory FAISS vector store.
db = FAISS.from_documents(documents, embedding_model)

# Save index to disk
db.save_local("faiss_index")
print("✅ FAISS index saved to 'faiss_index/' folder.")