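"""Ingest a resume PDF into a FAISS vector store.

Loads the PDF, scrubs personal information, splits the text into
overlapping chunks, embeds them with a sentence-transformers model, and
persists a FAISS index for later retrieval.
"""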
import os
import re
import sys

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project root importable so `rag.logger` resolves when this
# script is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from rag.logger import get_logger

logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, "data", "vectorstores")


def clean_personal_info(text: str) -> str:
    """Redact common PII (emails, phone numbers, URLs, handles, addresses)."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",  # email addresses
        r"\b\d{10}\b",  # bare 10-digit numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # formatted phone numbers
        r"https?://\S+",  # URLs
        r"linkedin\.com/\S+",  # LinkedIn profiles
        r"github\.com/\S+",  # GitHub profiles
        r"@[A-Za-z0-9_]+",  # social-media handles
        r"\d{1,4}\s+\w+\s+(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",  # street addresses
    ]
    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned
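
# Example (hypothetical input, for illustration only):
#   clean_personal_info("Reach me at jane.doe@example.com or 555-123-4567")
#   -> "Reach me at [REMOVED] or [REMOVED]"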


def load_and_prepare_documents(pdf_path: str):
    """Load a PDF, scrub PII, split into paragraph documents, then chunk."""
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Build one Document per paragraph, tagged with its page number and a
    # running paragraph id so retrieved chunks can be traced to the source.
    processed_docs = []
    paragraph_id = 0
    for page_num, page in enumerate(pages):
        clean_text = clean_personal_info(page.page_content)
        paragraphs = [p.strip() for p in clean_text.split("\n\n") if p.strip()]
        for para in paragraphs:
            processed_docs.append(
                Document(
                    page_content=para,
                    metadata={
                        "source": pdf_path,
                        "page": page_num,
                        "paragraph_id": paragraph_id,
                    },
                )
            )
            paragraph_id += 1

    # Re-split the paragraphs into overlapping chunks sized for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    final_chunks = text_splitter.split_documents(processed_docs)
    logger.info("Loaded and chunked PDF data")
    return final_chunks
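
# Example (hypothetical path, assumes a readable PDF exists there):
#   chunks = load_and_prepare_documents("data/resume_path/sample.pdf")
#   chunks[0].metadata  # e.g. {"source": "...", "page": 0, "paragraph_id": 0}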


def build_vectorstore(docs):
    """Embed the chunks and persist a FAISS index at `vector_store_path`."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(vector_store_path)
    logger.info(f"Vectorstore built with {len(docs)} chunks")


def ingest_resume(pdf_path: str):
    """End-to-end ingestion: parse and chunk the resume, then index it."""
    logger.info(f"Processing resume: {pdf_path}")
    parsed_docs = load_and_prepare_documents(pdf_path)
    logger.info(f"Chunks created: {len(parsed_docs)}")
    logger.info("Building FAISS index…")
    build_vectorstore(parsed_docs)
    logger.info("Ingestion complete")


if __name__ == "__main__":
    file_path = os.path.join(base_dir, "data", "resume_path", "Raheel_Rehman.pdf")
    ingest_resume(pdf_path=file_path)
    logger.info("Ingestion run successful")