Spaces:
Runtime error
Runtime error
File size: 1,386 Bytes
65afe01 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# index_builder.py
import json
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
# Build a FAISS vector index from the text records stored in a JSON file.
#
# Pipeline: load records -> split each record's text into overlapping chunks
# -> tag every chunk with coarse metadata -> embed -> save the index to disk.
file_path = "pdf_data.json"
documents = []
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

# Only the file read / JSON parse is guarded, so that unrelated bugs in the
# chunking loop below are not silently reported as a "load" failure.
try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
    print(f"❌ Failed to load: {e}")
    data = []

if not isinstance(data, list):
    # The loop below expects a JSON array of objects.
    print(f"❌ Failed to load: {file_path} does not contain a JSON array")
    data = []

for item in data:
    # Skip malformed records instead of aborting the whole build.
    text = item.get("text") if isinstance(item, dict) else None
    if not isinstance(text, str):
        continue

    # Keyword heuristic: any text mentioning "punishment" or "section" is
    # tagged "PPC" (presumably the penal code -- confirm with the data owner)
    # and treated as criminal law; everything else is "other" / "general".
    lowered = text.lower()
    section = "PPC" if "punishment" in lowered or "section" in lowered else "other"
    law_type = "criminal" if section == "PPC" else "general"

    for chunk in splitter.split_text(text):
        documents.append(Document(
            page_content=chunk,
            metadata={"section": section, "law_type": law_type},
        ))

# FAISS.from_documents fails with an obscure low-level error when given an
# empty list, so stop here with an actionable message instead.
if not documents:
    raise SystemExit(
        f"No text chunks could be loaded from {file_path!r}; FAISS index was not built."
    )

print(f"✅ Loaded {len(documents)} chunks with metadata")

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents, embedding_model)

# Save index to disk
db.save_local("faiss_index")
print("✅ FAISS index saved to 'faiss_index/' folder.")
|