import re from pathlib import Path import pdfplumber import fitz # PyMuPDF from langchain_text_splitters import RecursiveCharacterTextSplitter import json # ------------ PDF Extraction ------------ def extract_text_pdf(path: str) -> str: path = Path(path) text = "" # Try pdfplumber (best for text PDFs) try: with pdfplumber.open(path) as pdf: pages = [p.extract_text() or "" for p in pdf.pages] text = "\n\n".join(pages).strip() except: pass # If no text, try PyMuPDF (works on many PDFs) if not text or len(text) < 20: try: doc = fitz.open(path) pages = [page.get_text("text") for page in doc] doc.close() text = "\n\n".join(pages).strip() except: pass return text # ------------ Cleaning ------------ def clean_text(text: str) -> str: if not text: return "" text = text.replace("\r", "\n") text = re.sub(r"\n{3,}", "\n\n", text) text = re.sub(r"\s+", " ", text) # remove extra spaces return text # ------------ Process All Resumes & Save in One File ------------ def process_folder_to_single_file(folder_path: str, out_file: str = "cleaned_all.txt"): folder = Path(folder_path) pdf_files = list(folder.glob("*.pdf")) if not pdf_files: print(" No PDF resumes found in the folder.") return with open(out_file, "w", encoding="utf-8") as output: for idx, pdf in enumerate(pdf_files): resume_id = idx + 1 print(f"Processing: {pdf.name}") raw = extract_text_pdf(pdf) cleaned_text = clean_text(raw) output.write(f"\n\n") output.write(f"RESUME ID: {resume_id}\n") output.write(f"FILE NAME: {pdf.name}\n") output.write(cleaned_text) output.write("\n\n") print(f"\n All resumes cleaned & stored in: {out_file}") # Chunking using LangChain def chunk_resume(cleaned_text, chunk_size=10000, chunk_overlap=25): splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap,#to avoid context loss length_function=len ) chunks = splitter.split_text(cleaned_text) return chunks # Save Chunks to JSON file def save_chunks(chunks, output_path="resume_chunks.json"): data = [{"chunk_id": i+1, "text": chunk} for i, chunk in enumerate(chunks)] with open(output_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=4, ensure_ascii=False) print(f"Saved {len(chunks)} chunks to {output_path}") # ------------ CLI ------------ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Combine cleaned resumes into one file") parser.add_argument("folder", help="Folder containing PDF resumes") parser.add_argument("--json_out", help="Output chunk JSON file", default="resume_chunks.json") parser.add_argument("--job_file", help="Path to job description .txt file", default=None) parser.add_argument("--job_out", help="Output chunk JSON for JD", default="job_description_chunks.json") args = parser.parse_args() folder = Path(args.folder) pdf_files = list(folder.glob("*.pdf")) # --------- Auto-detect resume folder --------- if args.folder: folder = Path(args.folder) else: print(" Detecting resume folder automatically...") folders = [f for f in Path(".").iterdir() if f.is_dir()] folder = None for f in folders: if list(Path(f).glob("*.pdf")): folder = Path(f) print(f" Found folder with resumes: {f}") break if folder is None: print(" No folder containing PDF resumes found.") print("Hint: place resumes in a folder or use --folder ") exit() pdf_files = list(folder.glob("*.pdf")) if not pdf_files: print(f" No PDF files found in folder: {folder}") exit() process_folder_to_single_file(args.folder, "cleaned_all.txt") all_chunks = [] for idx, pdf in enumerate(pdf_files): resume_id = idx + 1 print(f"Processing: {pdf.name}") raw_text = extract_text_pdf(pdf) clean = clean_text(raw_text) chunks = chunk_resume(clean) for i, c in enumerate(chunks): all_chunks.append({ "resume_id": resume_id, "file_name": pdf.name, "chunk_id": i + 1, "text": c }) with open(args.json_out, "w", encoding="utf-8") as f: json.dump(all_chunks, f, indent=4, ensure_ascii=False) print(f"Finished! Saved {len(all_chunks)} chunks to {args.json_out}") # --------- Process Job Description (if provided) --------- if args.job_file: print(f"\nProcessing Job Description File: {args.job_file}") with open(args.job_file, "r", encoding="utf-8") as jf: jd_text = jf.read() jd_clean = clean_text(jd_text) jd_chunks = chunk_resume(jd_clean) jd_chunk_data = [ {"jd_chunk_id": i + 1, "text": chunk} for i, chunk in enumerate(jd_chunks) ] with open(args.job_out, "w", encoding="utf-8") as f: json.dump(jd_chunk_data, f, indent=4, ensure_ascii=False) print(f"Job description chunked into {len(jd_chunks)} parts and saved to {args.job_out}")