"""Ingest a resume PDF into a FAISS vector store for RAG retrieval.

Pipeline: load PDF -> redact personal info -> split into paragraphs ->
chunk -> embed -> persist FAISS index.
"""
import os
import re
import sys

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Make the project root importable so `rag.logger` resolves when this
# script is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from rag.logger import get_logger  # pylint: disable=import-error

logger = get_logger(__name__)

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
vector_store_path = os.path.join(base_dir, 'data', 'vectorstores')
# -------------------------------------------------
# PERSONAL INFO CLEANER
# -------------------------------------------------
def clean_personal_info(text: str) -> str:
    """Redact emails, phone numbers, URLs, handles, and street addresses."""
    patterns = [
        r"\b[\w\.-]+@[\w\.-]+\.\w+\b",                                      # email addresses
        r"\b\d{10}\b",                                                      # bare 10-digit numbers
        r"\b(?:\+?\d{1,3})?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",   # formatted phone numbers
        r"(http|https)://\S+",                                              # generic URLs
        r"linkedin\.com/\S+",                                               # LinkedIn profiles
        r"github\.com/\S+",                                                 # GitHub profiles
        r"@[A-Za-z0-9_]+",                                                  # social handles
        r"\d{1,4}\s+\w+\s+(Street|St|Road|Rd|Avenue|Ave|Lane|Ln)",          # street addresses
    ]
    cleaned = text
    for p in patterns:
        cleaned = re.sub(p, "[REMOVED]", cleaned, flags=re.IGNORECASE)
    return cleaned
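
# A quick illustrative check, assuming the patterns above (the input
# string is hypothetical):
#   clean_personal_info("Reach me at jane.doe@mail.com or 555-123-4567")
#   -> "Reach me at [REMOVED] or [REMOVED]"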
# -------------------------------------------------
# LOAD, CLEAN, PARAGRAPH SPLIT, CHUNK
# -------------------------------------------------
def load_and_prepare_documents(pdf_path: str):
    """Load a PDF, redact personal info, and return RAG-ready chunks."""
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    processed_docs = []
    paragraph_id = 0

    for page_num, page in enumerate(pages):  # page_num is 0-based
        # Step 1 — clean personal info
        clean_text = clean_personal_info(page.page_content)

        # Step 2 — paragraph split
        paragraphs = [p.strip() for p in clean_text.split("\n\n") if p.strip()]
        for para in paragraphs:
            processed_docs.append(
                Document(
                    page_content=para,
                    metadata={
                        "source": pdf_path,
                        "page": page_num,
                        "paragraph_id": paragraph_id,
                    },
                )
            )
            paragraph_id += 1

    # Step 3 — chunking (makes it RAG-ready)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""],
    )
    final_chunks = text_splitter.split_documents(processed_docs)

    logger.info("Loaded and chunked PDF data")
    return final_chunks
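
# Note on the splitter above: with these settings, paragraphs longer than
# 700 characters are recursively split on "\n\n", then "\n", then spaces,
# and consecutive chunks share up to 200 characters of overlap so context
# is not lost at chunk boundaries.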
# -------------------------------------------------
# BUILD VECTORSTORE
# -------------------------------------------------
def build_vectorstore(docs):
    """Embed the chunks and persist a FAISS index to disk."""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(vector_store_path)
    logger.info(f"Vectorstore built with {len(docs)} chunks")
# -------------------------------------------------
# MAIN PIPELINE
# -------------------------------------------------
def ingest_resume(pdf_path: str):
    """Run the full ingestion pipeline for a single resume PDF."""
    logger.info(f"Processing resume: {pdf_path}")
    parsed_docs = load_and_prepare_documents(pdf_path)
    logger.info(f"Chunks created: {len(parsed_docs)}")
    logger.info("Building FAISS index…")
    build_vectorstore(parsed_docs)
    logger.info("Ingestion complete")


if __name__ == "__main__":
    file_path = os.path.join(base_dir, 'data', 'resume_path', 'Raheel_Rehman.pdf')
    ingest_resume(pdf_path=file_path)
    logger.info("Ingestion Run Successful")