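"""Ingest a resume PDF for RAG: load it, scrub personal information, split it
into paragraph-level Documents, chunk them, and persist a FAISS vectorstore."""
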
import os
import sys
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger  # pylint: disable=import-error
logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')

# -------------------------------------------------
#  PERSONAL INFO CLEANER
# -------------------------------------------------
def clean_personal_info(text: str) -> str:
    """Mask common personal identifiers in raw resume text with "[REMOVED]"."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                      # email addresses
        r"\b\d{10}\b",                                                      # bare 10-digit phone numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",   # formatted phone numbers
        r"(http|https)://\S+",                                              # URLs
        r"linkedin\.com/\S+",                                               # LinkedIn profile links
        r"github\.com/\S+",                                                 # GitHub profile links
        r"@[A-Za-z0-9_]+",                                                  # social-media handles
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",          # street addresses
    ]

    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)

    return cleaned
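
# Quick sanity check (hypothetical input and output):
#   clean_personal_info("Reach me at jane.doe@example.com or 555-123-4567")
#   -> "Reach me at [REMOVED] or [REMOVED]"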

# -------------------------------------------------
#  LOAD, CLEAN, PARAGRAPH SPLIT, CHUNK
# -------------------------------------------------
def load_and_prepare_documents(pdf_path: str):
    """Load a PDF, scrub personal info, split into paragraphs, then chunk for RAG."""
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    processed_docs = []
    paragraph_id = 0

    for page_num, page in enumerate(pages):
        # Step 1 — clean personal info
        clean_text = clean_personal_info(page.page_content)

        # Step 2 — paragraph split
        paragraphs = [p.strip() for p in clean_text.split("\n\n") if p.strip()]

        for para in paragraphs:
            processed_docs.append(
                Document(
                    page_content=para,
                    metadata={
                        "source": pdf_path,
                        "page": page_num,
                        "paragraph_id": paragraph_id,
                    }
                )
            )
            paragraph_id += 1

    # Step 3 — chunking (makes it RAG-ready)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )

    final_chunks = text_splitter.split_documents(processed_docs)
    logger.info("Loaded and chunked PDF data")
    return final_chunks
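
# Each returned chunk is a Document that keeps its provenance, roughly:
#   Document(
#       page_content="...up to ~700 characters of resume text...",
#       metadata={"source": pdf_path, "page": 0, "paragraph_id": 3},
#   )
# split_documents() copies each paragraph's metadata onto every sub-chunk,
# so page and paragraph_id survive the 700-character re-split.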

# -------------------------------------------------
#  BUILD VECTORSTORE
# -------------------------------------------------
def build_vectorstore(docs):
    """Embed the chunks and persist a FAISS index to vector_store_path."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    vectorstore = FAISS.from_documents(docs, embeddings)

    vectorstore.save_local(vector_store_path)
    logger.info(f"Vectorstore built with {len(docs)} chunks")

# -------------------------------------------------
#  MAIN PIPELINE
# -------------------------------------------------
def ingest_resume(pdf_path: str):
    """End-to-end pipeline: parse and clean the resume, then build the FAISS index."""
    logger.info(f"Processing resume: {pdf_path}")

    parsed_docs = load_and_prepare_documents(pdf_path)
    logger.info(f"Chunks created: {len(parsed_docs)}")

    logger.info("Building FAISS index…")
    build_vectorstore(parsed_docs)

    logger.info("Ingestion complete")


if __name__ == "__main__":
    file_path = os.path.join(base_dir, 'data', 'resume_path', 'Raheel_Rehman.pdf')
    ingest_resume(pdf_path=file_path)
    logger.info("Ingestion Run Successful")