Mishal23 committed on
Commit
65afe01
·
verified ·
1 Parent(s): 2621a35

Create index_builder.py

Browse files
Files changed (1) hide show
  1. index_builder.py +37 -0
index_builder.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# index_builder.py
#
# Build a FAISS vector index from pre-extracted PDF text stored in
# pdf_data.json. Each text entry is split into overlapping chunks and
# tagged with coarse section / law-type metadata before embedding.

import json

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

DATA_FILE = "pdf_data.json"
INDEX_DIR = "faiss_index"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def classify(text):
    """Return (section, law_type) labels for a block of statute text.

    Heuristic: text mentioning 'punishment' or 'section' (case-insensitive)
    is labeled ("PPC", "criminal"); everything else ("other", "general").
    """
    lowered = text.lower()
    if "punishment" in lowered or "section" in lowered:
        return "PPC", "criminal"
    return "other", "general"


def load_documents(file_path=DATA_FILE):
    """Load *file_path* (a JSON list of {"text": ...} items) and return a
    list of chunked Documents with section/law_type metadata.

    Returns an empty list (after printing a message) if the file cannot
    be read or parsed, matching the original script's best-effort style.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    documents = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed from the original bare `except Exception` — only the
        # file-read / parse failures we actually expect are swallowed.
        print(f"❌ Failed to load: {e}")
        return documents

    for item in data:
        if "text" not in item:
            continue  # skip entries with no extractable text
        section, law_type = classify(item["text"])
        for chunk in splitter.split_text(item["text"]):
            documents.append(Document(
                page_content=chunk,
                metadata={"section": section, "law_type": law_type},
            ))
    return documents


def main():
    documents = load_documents()
    print(f"✅ Loaded {len(documents)} chunks with metadata")

    # Bug fix: the original fell through to FAISS.from_documents([]) after a
    # load failure, which raises a confusing downstream error. Refuse to
    # build an index from nothing.
    if not documents:
        raise SystemExit("❌ No documents loaded; not building an empty index.")

    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    db = FAISS.from_documents(documents, embedding_model)

    # Save index to disk
    db.save_local(INDEX_DIR)
    print(f"✅ FAISS index saved to '{INDEX_DIR}/' folder.")


if __name__ == "__main__":
    main()