File size: 1,386 Bytes
65afe01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# index_builder.py

import json
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Build a FAISS vector index from the text records stored in a JSON file.
#
# Pipeline: load JSON -> tag each record with metadata -> split the text into
# overlapping chunks -> embed the chunks -> save the index to disk.

file_path = "pdf_data.json"
documents = []
# chunk_size / chunk_overlap are measured with the splitter's length function
# (characters, unless the splitter is configured otherwise).
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)


def _classify(text):
    """Return the ``(section, law_type)`` metadata pair for a record's text.

    Text mentioning "punishment" or "section" (case-insensitive) is tagged
    ``("PPC", "criminal")``; anything else is tagged ``("other", "general")``.
    """
    # NOTE(review): this is a plain keyword heuristic; "section" is likely to
    # match most legal text - confirm this is the intended classification.
    lowered = text.lower()  # lower-case once rather than once per keyword
    if "punishment" in lowered or "section" in lowered:
        return "PPC", "criminal"
    return "other", "general"


# Keep the try body limited to the read/parse step so that only I/O and
# decoding problems are reported as load failures.
try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    print(f"❌ Failed to load: {e}")
    # Stop here: continuing would load the embedding model and then try to
    # index an empty corpus.
    raise SystemExit(1) from e

for item in data:
    # Skip malformed records individually instead of aborting the whole load
    # part-way through and silently building a partial index.
    text = item.get("text") if isinstance(item, dict) else None
    if not isinstance(text, str):
        continue
    section, law_type = _classify(text)
    documents.extend(
        Document(
            page_content=chunk,
            # A fresh dict per chunk so documents never share metadata objects.
            metadata={"section": section, "law_type": law_type},
        )
        for chunk in splitter.split_text(text)
    )

if not documents:
    # Nothing to embed; fail with a clear message before the (expensive)
    # embedding model is loaded.
    print(f"❌ No text chunks found in '{file_path}'; index not built.")
    raise SystemExit(1)

print(f"✅ Loaded {len(documents)} chunks with metadata")

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents, embedding_model)

# Save index to disk
db.save_local("faiss_index")
print("✅ FAISS index saved to 'faiss_index/' folder.")