Spaces:
Runtime error
Runtime error
| import os | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import SentenceTransformerEmbeddings | |
| from langchain.vectorstores import FAISS | |
# --- Configurations ---
# Path to the PDF file that will be loaded.
PDF_PATH = "data/azure-databricks.pdf"
# Name of the embedding model to use.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Folder in which the vector database will be saved.
VECTOR_DB_PATH = "vector_db"
# --- 1. Load the PDF ---
# Load the PDF at PDF_PATH into `documents` (the success message reports its
# length as the page count). If loading fails, the error is reported and the
# script stops with a non-zero exit status so callers can detect the failure.
print(f"Loading PDF from: {PDF_PATH}...")
try:
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
except Exception as e:  # script-level boundary: report any loader failure, then stop
    print(f"Error loading PDF: {e}")
    print("Please ensure the PDF file exists and the path is correct.")
    # `exit()` is a helper added by the `site` module for interactive use and,
    # called without an argument, exits with status 0 (success). Raising
    # SystemExit directly always works and signals the failure.
    raise SystemExit(1)
else:
    print(f"PDF loaded successfully! Total of {len(documents)} pages.")
# --- 2. Split the text into chunks ---
print("Splitting text into chunks...")
# The splitter settings are collected in one mapping and unpacked into the
# constructor as keyword arguments.
_splitter_settings = {
    "chunk_size": 1000,      # maximum size of each chunk (in characters)
    "chunk_overlap": 200,    # characters neighbouring chunks may share (keeps context)
    "length_function": len,  # function used to measure chunk length
}
text_splitter = RecursiveCharacterTextSplitter(**_splitter_settings)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks.")
# --- 3. Create Embeddings and Store in FAISS ---
# Stop early when there is nothing to index (for example, a PDF from which no
# text could be extracted). Checking before the embedding model is loaded
# avoids that work and gives a clear message.
# NOTE(review): FAISS.from_documents is expected to fail on an empty list,
# since there is no vector to size the index from -- confirm against the
# installed langchain version.
if not chunks:
    print("No text chunks were produced from the PDF; nothing to index.")
    raise SystemExit(1)

print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Generating embeddings and creating the FAISS vector database...")
# Create the vector database from the chunks and the embedding model.
vector_db = FAISS.from_documents(chunks, embeddings)
# --- 4. Save the Vector Database ---
# Write the vector database to the VECTOR_DB_PATH folder, announcing the
# target before the call and confirming once it returns.
print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
vector_db.save_local(folder_path=VECTOR_DB_PATH)
print("Vector database created and saved successfully!")