Spaces:

ZeeAI1
/

LawFi4

Sleeping

ZeeAI1 committed on Nov 9, 2024

Commit

b3dde21

verified ·

1 Parent(s): 9692911

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import os
+import pdfplumber
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+def preprocess_pdfs(folder_path, save_vectorstore_path):
+    all_text = ""
+    pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
+    for file_path in pdf_files:
+        with pdfplumber.open(file_path) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    all_text += page_text
+    if all_text:
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
+        text_chunks = text_splitter.split_text(all_text)
+        embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
+        # Ensure the save directory exists
+        os.makedirs(save_vectorstore_path, exist_ok=True)
+        vector_store.save_local(save_vectorstore_path)
+        print("Data preprocessing and vector store creation completed!")
+# Define your folder paths
+data_folder = 'documents1'  # Replace with the path to your PDFs
+vectorstore_path = 'vector_store_data/faiss_vectorstore'  # Path to save vector store
+# Run preprocessing
+preprocess_pdfs(data_folder, vectorstore_path)