"""Resume ingestion pipeline: load a PDF, scrub personal information,
split it into paragraph-level documents, chunk those for retrieval, and
persist a FAISS vectorstore."""
import os
import re
import sys

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project root importable so `rag.logger` resolves when this
# script is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger  # pylint: disable=import-error,wrong-import-position

logger = get_logger(__name__)

# Project root and the directory where the FAISS index is persisted.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')
# -------------------------------------------------
# PERSONAL INFO CLEANER
# -------------------------------------------------
def clean_personal_info(text: str) -> str:
    """Redact common personal identifiers (emails, phone numbers, URLs,
    social handles, street addresses) before the text is indexed."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                     # email addresses
        r"\b\d{10}\b",                                                     # bare 10-digit numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # formatted phone numbers
        r"https?://\S+",                                                   # URLs
        r"linkedin\.com/\S+",                                              # LinkedIn profiles
        r"github\.com/\S+",                                                # GitHub profiles
        r"@[A-Za-z0-9_]+",                                                 # social media handles
        r"\d{1,4}\s+\w+\s+(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln)\b",     # street addresses
    ]
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned
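
# Quick illustrative check of the redaction behavior; the sample string is
# hypothetical and not part of the pipeline:
#
#     >>> clean_personal_info("Reach me at jane.doe@example.com or 555-123-4567")
#     'Reach me at [REMOVED] or [REMOVED]'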
# -------------------------------------------------
# LOAD, CLEAN, PARAGRAPH SPLIT, CHUNK
# -------------------------------------------------
def load_and_prepare_documents(pdf_path: str) -> list[Document]:
    """Load a PDF, redact personal info, split each page into paragraph
    documents, then chunk those documents for retrieval."""
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    processed_docs = []
    paragraph_id = 0
    for page_num, page in enumerate(pages):
        # Step 1 — clean personal info
        clean_text = clean_personal_info(page.page_content)

        # Step 2 — paragraph split. If the extracted text has no blank-line
        # breaks, the whole page becomes one paragraph and the chunker below
        # still splits it down to size.
        paragraphs = [p.strip() for p in clean_text.split("\n\n") if p.strip()]
        for para in paragraphs:
            processed_docs.append(
                Document(
                    page_content=para,
                    metadata={
                        "source": pdf_path,
                        "page": page_num,
                        "paragraph_id": paragraph_id,
                    },
                )
            )
            paragraph_id += 1

    # Step 3 — chunking (makes it RAG-ready)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    final_chunks = text_splitter.split_documents(processed_docs)
    logger.info(f"Loaded {len(pages)} pages from {pdf_path}")
    return final_chunks
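
# For orientation: split_documents preserves metadata, so every chunk emitted
# above carries its provenance. Illustrative shape only; the values below are
# not real output:
#
#     Document(
#         page_content="Led migration of ...",
#         metadata={"source": "<pdf_path>", "page": 0, "paragraph_id": 3},
#     )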
# -------------------------------------------------
# BUILD VECTORSTORE
# -------------------------------------------------
def build_vectorstore(docs):
    """Embed the chunks with a sentence-transformers model and persist the
    FAISS index to `vector_store_path`."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(vector_store_path)
    logger.info(f"Vectorstore built with {len(docs)} chunks")
    return vectorstore
# -------------------------------------------------
# MAIN PIPELINE
# -------------------------------------------------
def ingest_resume(pdf_path: str):
    """End-to-end ingestion: parse and chunk the resume, then build and
    persist the FAISS index."""
    logger.info(f"Processing resume: {pdf_path}")
    parsed_docs = load_and_prepare_documents(pdf_path)
    logger.info(f"Chunks created: {len(parsed_docs)}")
    logger.info("Building FAISS index…")
    build_vectorstore(parsed_docs)
    logger.info("Ingestion complete")
if __name__ == "__main__":
    file_path = os.path.join(base_dir, 'data', 'resume_path', 'Raheel_Rehman.pdf')
    ingest_resume(pdf_path=file_path)
    logger.info("Ingestion run successful")