"""Ingest local PDF/text files into a FAISS vector store.

Running this module as a script loads every ``.pdf`` and ``.txt`` file found
directly inside ``DATA_PATH``, splits the documents into overlapping chunks,
embeds them with a sentence-transformers model on the CPU, and saves the
resulting FAISS index under ``DB_FAISS_PATH``.
"""

import os

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# NOTE: DATA_PATH is a relative path, so it is resolved against the current
# working directory, whereas DB_FAISS_PATH is anchored to this script's
# directory.
DATA_PATH = "data"
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DB_FAISS_PATH = os.path.join(SCRIPT_DIR, "vectorstore", "db_faiss")


def _load_documents(data_path):
    """Load every PDF and text file located directly inside *data_path*.

    Args:
        data_path: Directory to scan (not recursive).

    Returns:
        A list with the documents produced by the loaders, in file-name order.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # ingestion order (and therefore the chunk order) reproducible.
    for filename in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, filename)

        # Skip sub-directories, even ones whose name ends in ".pdf"/".txt".
        if not os.path.isfile(file_path):
            continue

        # Compare extensions case-insensitively so "REPORT.PDF" is ingested.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(file_path, encoding='utf-8')
        else:
            continue

        documents.extend(loader.load())
        print(f"Loaded {filename}")

    return documents


def create_vector_db():
    """Build the FAISS vector store from the files in ``DATA_PATH``.

    Progress is reported on stdout. The function returns early, without
    writing anything, when ``DATA_PATH`` is not a directory or when it
    contains no ``.pdf``/``.txt`` files.

    Returns:
        None. The index is written to ``DB_FAISS_PATH`` as a side effect.
    """
    # Check if data directory exists (a regular file with that name does not
    # count: os.listdir() would fail on it).
    if not os.path.isdir(DATA_PATH):
        print(f"Directory {DATA_PATH} not found.")
        return

    # Load documents
    documents = _load_documents(DATA_PATH)
    if not documents:
        print("No documents found to ingest.")
        return

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create embeddings (using HuggingFace - FREE!)
    print("Generating embeddings locally with sentence-transformers...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )

    # Create vector store
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
    print(f"Vector store saved to {DB_FAISS_PATH}")


if __name__ == "__main__":
    create_vector_db()