import os
import re
import sys

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project root importable when this script is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from rag.logger import get_logger  # pylint: disable=import-error

logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, "data", "vectorstores")


# -------------------------------------------------
# PERSONAL INFO CLEANER
# -------------------------------------------------
def clean_personal_info(text: str) -> str:
    """Redact common personal identifiers (emails, phone numbers, URLs,
    social handles, street addresses) before the text is indexed."""
    # Order matters: emails are redacted before the @-handle pattern runs,
    # so the handle pattern never mangles a partially matched email.
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                     # email addresses
        r"\b\d{10}\b",                                                     # bare 10-digit phone numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # formatted phone numbers
        r"https?://\S+",                                                   # URLs
        r"linkedin\.com/\S+",                                              # LinkedIn profiles
        r"github\.com/\S+",                                                # GitHub profiles
        r"@[A-Za-z0-9_]+",                                                 # social media handles
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",         # street addresses
    ]
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned


# -------------------------------------------------
# LOAD, CLEAN, PARAGRAPH SPLIT, CHUNK
# -------------------------------------------------
def load_and_prepare_documents(pdf_path: str) -> list[Document]:
    """Load a PDF, redact personal info, split into paragraphs, then chunk."""
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    processed_docs = []
    paragraph_id = 0

    for page_num, page in enumerate(pages):
        # Step 1: redact personal info
        clean_text = clean_personal_info(page.page_content)

        # Step 2: split on blank lines into paragraphs, dropping empties
        paragraphs = [p.strip() for p in clean_text.split("\n\n") if p.strip()]

        for para in paragraphs:
            processed_docs.append(
                Document(
                    page_content=para,
                    metadata={
                        "source": pdf_path,
                        "page": page_num,
                        "paragraph_id": paragraph_id,
                    },
                )
            )
            paragraph_id += 1

    # Step 3: chunking (makes the paragraphs RAG-ready)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    final_chunks = text_splitter.split_documents(processed_docs)

    logger.info("Loaded and prepared PDF data")
    return final_chunks


# -------------------------------------------------
# BUILD VECTORSTORE
# -------------------------------------------------
def build_vectorstore(docs: list[Document]) -> None:
    """Embed the chunks and persist a FAISS index to disk."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(vector_store_path)
    logger.info(f"Vectorstore built with {len(docs)} chunks")


# -------------------------------------------------
# MAIN PIPELINE
# -------------------------------------------------
def ingest_resume(pdf_path: str) -> None:
    """End-to-end ingestion: load and clean the PDF, chunk it, build the index."""
    logger.info(f"Processing resume: {pdf_path}")
    parsed_docs = load_and_prepare_documents(pdf_path)
    logger.info(f"Chunks created: {len(parsed_docs)}")

    logger.info("Building FAISS index…")
    build_vectorstore(parsed_docs)
    logger.info("Ingestion complete")


if __name__ == "__main__":
    file_path = os.path.join(base_dir, "data", "resume_path", "Raheel_Rehman.pdf")
    ingest_resume(pdf_path=file_path)
    logger.info("Ingestion Run Successful")
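

# -------------------------------------------------
# RETRIEVAL SMOKE TEST (sketch)
# -------------------------------------------------
# A minimal sketch of how a separate script might verify the saved index;
# it is not part of the ingestion pipeline above. Assumptions not taken
# from the original: the function name, the example query, and the
# allow_dangerous_deserialization flag, which recent langchain_community
# releases require when loading a locally pickled docstore.
def smoke_test_vectorstore(query: str = "work experience") -> None:
    """Reload the persisted FAISS index and log the top matches."""
    # The index must be loaded with the same embedding model it was built with.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.load_local(
        vector_store_path,
        embeddings,
        allow_dangerous_deserialization=True,  # safe here: we built this index ourselves
    )
    for doc in vectorstore.similarity_search(query, k=3):
        logger.info(f"page={doc.metadata['page']} | {doc.page_content[:80]}")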