import os
import re

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load environment variables (e.g. API tokens) from a .env file.
load_dotenv()

# Directory where the serialized FAISS index is persisted.
DB_FAISS_PATH = 'vectorStore'


def clean_text(text):
    """Strip noisy header/footer lines and normalize whitespace.

    Lines with five or fewer alphabetic characters (page numbers, rules,
    extraction artifacts) are dropped; all remaining whitespace — including
    newlines — is then collapsed to single spaces.

    Args:
        text: Raw page text extracted from a document.

    Returns:
        The cleaned, single-spaced text.
    """
    # Collapse runs of blank lines first so the line filter below sees
    # one line per logical line of the page.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Keep only lines with enough alphabetic content to be real prose.
    kept = [
        line for line in text.split('\n')
        if sum(c.isalpha() for c in line) > 5
    ]
    text = '\n'.join(kept)
    # Flatten all remaining whitespace (including newlines) to single spaces.
    return re.sub(r'\s+', ' ', text).strip()


def load_documents(data_dir='../data'):
    """Recursively load PDF and plain-text documents from *data_dir*.

    PDFs go through PyPDFLoader (one Document per page); ``.txt`` files are
    read manually as UTF-8 so that files which fail to decode are skipped
    with a warning instead of aborting the whole ingestion run.

    Args:
        data_dir: Root directory to walk for ``.pdf``/``.txt`` files.
            Defaults to the original hard-coded ``'../data'``.

    Returns:
        A list of langchain ``Document`` objects (possibly empty).
    """
    documents = []
    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.lower().endswith('.pdf'):
                print(f"Loading PDF {file_path}")
                documents.extend(PyPDFLoader(file_path).load())
            elif file.lower().endswith('.txt'):
                print(f"Loading TXT {file_path}")
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        documents.append(
                            Document(page_content=f.read(),
                                     metadata={"source": file_path})
                        )
                except UnicodeDecodeError as e:
                    # Best-effort policy: skip undecodable files and keep
                    # ingesting the rest rather than crashing.
                    print(f"⚠ Skipping {file_path} due to encoding error: {e}")
    return documents


def create_vector_db(data_dir='../data'):
    """Build and persist a FAISS vector store from local documents.

    Pipeline: load -> clean -> chunk -> embed -> index -> save to
    ``DB_FAISS_PATH``. Progress is printed at each step; when no documents
    are found the function returns early without creating an index.

    Args:
        data_dir: Directory scanned for source documents. Defaults to the
            original hard-coded ``'../data'``.
    """
    print("Step 1: Loading documents from the 'data/' directory...")
    documents = load_documents(data_dir)
    if not documents:
        print("No documents found in the 'data' directory. Exiting.")
        return
    print(f"Loaded {len(documents)} document(s).")

    print("\nStep 2: Cleaning the text content...")
    for doc in documents:
        doc.page_content = clean_text(doc.page_content)
    print("Text cleaning complete.")

    print("\nStep 3: Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings with HuggingFace...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    print("Step 5: Building FAISS index...")
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(DB_FAISS_PATH)
    print(f"\n✅ Ingestion complete! Vector store saved at '{DB_FAISS_PATH}'")


if __name__ == "__main__":
    create_vector_db()