# index_builder.py
import json

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

file_path = "pdf_data.json"
documents = []
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        if "text" not in item:
            continue
        # Crude keyword heuristic: tag anything mentioning "punishment" or
        # "section" as Pakistan Penal Code (PPC) material, everything else as general.
        text_lower = item["text"].lower()
        section = "PPC" if "punishment" in text_lower or "section" in text_lower else "other"
        law_type = "criminal" if section == "PPC" else "general"

        for chunk in splitter.split_text(item["text"]):
            documents.append(Document(
                page_content=chunk,
                metadata={"section": section, "law_type": law_type},
            ))
except Exception as e:
    raise SystemExit(f"❌ Failed to load {file_path}: {e}")

# Bail out before embedding: FAISS.from_documents raises on an empty list.
if not documents:
    raise SystemExit("❌ No chunks produced; nothing to index.")

print(f"✅ Loaded {len(documents)} chunks with metadata")

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents, embedding_model)

# Save index to disk
db.save_local("faiss_index")
print("✅ FAISS index saved to 'faiss_index/' folder.")
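
# Optional smoke test: a minimal sketch of reloading the saved index and running
# one query to confirm it round-trips. The query string, k value, and printed
# fields here are illustrative assumptions, not from the original script.
# Recent langchain_community releases require allow_dangerous_deserialization=True
# for FAISS.load_local because the docstore is unpickled from disk.
reloaded = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)
for doc in reloaded.similarity_search("punishment for theft", k=3):
    # Each hit carries the metadata attached at build time.
    print(doc.metadata["section"], "→", doc.page_content[:80])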