Spaces:
Sleeping
Sleeping
File size: 1,813 Bytes
c5fe829 d10f0b0 c5fe829 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | import os
import glob
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
# Load environment variables from a .env file, if present, before anything
# reads them (presumably the OpenAI credentials for OpenAIEmbeddings — confirm).
load_dotenv()
# Anchor all paths to this file's directory so the script behaves the same
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Input folder scanned for "*.pdf" files by load_documents().
DATA_PATH = os.path.join(BASE_DIR, "data")
# Output location passed to FAISS.save_local() by save_to_faiss().
DB_PATH = os.path.join(BASE_DIR, "vector_db")
def load_documents():
    """Load every ``*.pdf`` file found directly inside ``DATA_PATH``.

    Files are processed in sorted path order so that repeated runs over the
    same folder produce the same document order.

    Returns:
        A list with the combined results of ``PyPDFLoader(...).load()`` for
        every PDF that loaded successfully (presumably one LangChain document
        per page -- confirm against the loader). The list is empty when the
        folder contains no PDFs.
    """
    # glob order depends on the filesystem; sort for a reproducible index.
    pdf_files = sorted(glob.glob(os.path.join(DATA_PATH, "*.pdf")))
    if not pdf_files:
        print(f"No PDF files found in {DATA_PATH}")
        return []

    print(f"Found {len(pdf_files)} PDF files.")
    documents = []
    for pdf_file in pdf_files:
        print(f"Loading {pdf_file}...")
        try:
            pages = PyPDFLoader(pdf_file).load()
        except Exception as e:
            # Deliberately best-effort: one unreadable PDF is reported and
            # skipped instead of aborting the whole batch.
            print(f"Error loading {pdf_file}: {e}")
        else:
            documents.extend(pages)
    return documents
def split_documents(documents, *, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: The documents to split, as returned by ``load_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1000, the value this script has always used.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200, the value this script has always used.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record each chunk's start offset.
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
def save_to_faiss(chunks):
    """Embed ``chunks`` and persist the resulting FAISS index at ``DB_PATH``.

    Args:
        chunks: The document chunks to index. When empty, nothing is embedded
            or written and the function returns immediately.

    Returns:
        None.
    """
    # PDFs without extractable text (e.g. scanned images) can leave us with
    # zero chunks. FAISS.from_documents needs at least one embedding to size
    # the index, so bail out with a clear message instead of an opaque error.
    if not chunks:
        print("No chunks to index; skipping vector database creation.")
        return

    embeddings = OpenAIEmbeddings()
    print("Creating vector database...")
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(DB_PATH)
    print(f"Saved {len(chunks)} chunks to {DB_PATH}.")
def main():
    """Run the ingestion pipeline: load PDFs, split them, save the index.

    Does nothing further when ``load_documents`` returns no documents.
    """
    loaded = load_documents()
    if loaded:
        save_to_faiss(split_documents(loaded))
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger any loading or indexing work.
if __name__ == "__main__":
    main()
|