Spaces:
Runtime error
Runtime error
| import ray | |
| import logging | |
| import os | |
| from langchain_community.document_loaders import DirectoryLoader | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
# --- Script setup: runtime, logging, paths, corpus, embedding model ---------
# Everything below runs at import time and defines the module-level names
# (index_directory, index_path_faiss, index_path_pkl, texts, embeddings) that
# the index helper functions further down read.

# Initialize Ray (safe even if already running)
ray.init(ignore_reinit_error=True)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define FAISS index paths
index_directory = 'ipc_embed_db'
index_path_faiss = os.path.join(index_directory, 'index.faiss')
index_path_pkl = os.path.join(index_directory, 'index.pkl')

# Ensure index directory exists
os.makedirs(index_directory, exist_ok=True)

# Load documents
logging.info("π Loading legal documents from 'data/' directory...")
loader = DirectoryLoader('data', glob="**/*.txt")  # Recursively load .txt files
documents = loader.load()

# Check if any documents were found
if not documents:
    logging.error("β No documents found in 'data/'. Please add .txt files to proceed.")
    ray.shutdown()
    # Use SystemExit rather than the bare `exit()` helper: `exit` is injected
    # by the `site` module for interactive use and is not guaranteed to exist,
    # and calling it without an argument reports success (status 0) even
    # though this is an error path.
    raise SystemExit(1)

# Split documents into chunks
logging.info("βοΈ Splitting documents for embedding...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Load the InLegalBERT embedding model
logging.info("π¦ Loading HuggingFace embedding model: 'law-ai/InLegalBERT'...")
embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
| # Create and save the FAISS index | |
def create_faiss_index():
    """Build a FAISS index from the split documents and save it to disk.

    Reads the module-level ``texts``, ``embeddings`` and ``index_directory``.

    Returns:
        The FAISS vector store that was just built and saved.
    """
    logging.info("βοΈ Creating new FAISS index...")
    vector_store = FAISS.from_documents(texts, embeddings)
    vector_store.save_local(index_directory)
    logging.info("β FAISS index saved in '%s'.", index_directory)
    return vector_store
| # Load existing index or create if missing | |
def load_or_create_faiss_index():
    """Return the FAISS store, loading it from disk when both index files exist.

    If either index file is missing, or loading the saved index raises, a new
    index is built via ``create_faiss_index()``.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    index_files_present = (
        os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl)
    )
    if not index_files_present:
        logging.info("β FAISS index files not found. Creating new index...")
        return create_faiss_index()

    logging.info("π Loading existing FAISS index...")
    try:
        # NOTE(security): allow_dangerous_deserialization=True unpickles
        # index.pkl; only load index directories this script produced itself.
        loaded_db = FAISS.load_local(
            index_directory,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        logging.info("β FAISS index loaded successfully.")
        return loaded_db
    except Exception as e:
        # Loading failed: log it and fall through to rebuild the index.
        logging.warning("β οΈ Failed to load existing index. Recreating... (%s)", str(e))

    return create_faiss_index()
# Build the index. Ray is shut down in `finally` so it is also released when
# loading or building the index raises.
try:
    faiss_db = load_or_create_faiss_index()
finally:
    # Shut down Ray
    ray.shutdown()

# Optional: if you want to use the retriever later
# db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Only reached when the index was loaded or built without an exception.
logging.info("β Indexing process completed successfully.")