Spaces:
Sleeping
Sleeping
| try: | |
| import cv2 | |
| import numpy as np | |
| from pdf2image import convert_from_path | |
| import pytesseract | |
| CV2_AVAILABLE = True | |
| except ImportError: | |
| CV2_AVAILABLE = False | |
| from glob import glob | |
| from pypdf import PdfReader | |
| from tqdm import tqdm | |
| import os | |
| import PyPDF2 | |
| import io | |
class DirectoryReader:
    """Read job description (JD) and resume files and convert them to text.

    The batch readers (``read_jd_files`` / ``read_resume_files``) expand glob
    patterns, lower-case the extracted text and cache it on the instance.
    Resume PDFs that yield no usable text are run through OCR instead.
    ``extract_text_from_pdf`` handles a single in-memory PDF upload.
    """

    def __init__(self, path_to_jds, path_to_resumes):
        """Store the glob patterns and create the empty result caches.

        Args:
            path_to_jds (str): Pattern handed to ``glob(..., recursive=True)``
                to find job description text files.
            path_to_resumes (str): Pattern handed to
                ``glob(..., recursive=True)`` to find resume PDF files.
        """
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}

    def read_jd_files(self):
        """Read every job description matched by ``path_to_jds``.

        Each file is read as UTF-8, stripped and lower-cased, then stored in
        ``self.jd_data`` under its file name with ``".txt"`` removed.

        Returns:
            dict: Job names mapped to job description text.
        """
        for file in tqdm(glob(self.path_to_jds, recursive=True)):
            # A recursive pattern such as "jds/**" also yields directories,
            # which cannot be opened as text files.
            if not os.path.isfile(file):
                continue
            with open(file, "r", encoding="utf-8") as f:
                data = f.read().strip().lower()
            # basename() copes with the platform's separator, unlike
            # splitting on "/".
            job_name = os.path.basename(file).replace(".txt", "")
            self.jd_data[job_name] = data
        return self.jd_data

    @staticmethod
    def extract_text_from_pdf_path(file_path):
        """Extract text from a PDF file given a file path (batch processing).

        Declared static because it is invoked as
        ``self.extract_text_from_pdf_path(path)`` and takes no instance
        argument.

        Args:
            file_path (str): Location of the PDF on disk.

        Returns:
            str: Text of all pages, one page per line block, stripped and
            lower-cased. Empty when no page yields text.
        """
        reader = PdfReader(file_path)
        # "or ''" guards against a page that yields no text object, matching
        # the handling in extract_text_from_pdf.
        pages = [(page.extract_text() or "") + "\n" for page in reader.pages]
        return "".join(pages).strip().lower()

    def extract_text_from_image(self, file):
        """OCR every page of a PDF and return the combined text.

        Args:
            file (str): Location of the PDF on disk.

        Returns:
            str: OCR text of all pages joined by newlines, stripped and
            lower-cased.

        Raises:
            ImportError: If the optional image-processing packages could not
                be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")
        extracted_text = []
        for page in convert_from_path(file):
            # Step 1: straighten the rendered page.
            preprocessed_image = self.deskew(np.array(page))
            # Step 2: run OCR on the straightened page.
            extracted_text.append(self.get_text_from_image(preprocessed_image))
        return "\n".join(extracted_text).strip().lower()

    def read_resume_files(self):
        """Read every resume matched by ``path_to_resumes``.

        The cache key is ``<parent directory>_<file name>``, lower-cased, with
        spaces in the directory name and dashes in the file name replaced by
        underscores and ``".pdf"`` removed. When direct extraction yields at
        most one character, the file is processed with OCR instead.

        Returns:
            dict: Resume identifiers mapped to resume text.
        """
        for file in tqdm(glob(self.path_to_resumes, recursive=True)):
            # Recursive patterns may also match directories; skip them.
            if not os.path.isfile(file):
                continue
            # The job title is the directory that directly contains the file.
            # Resolving the absolute path first keeps this working for a bare
            # "resume.pdf", which has no parent component of its own.
            parent_dir = os.path.basename(os.path.dirname(os.path.abspath(file)))
            job_title = parent_dir.replace(" ", "_").lower()
            # The resume name is the file name without the extension.
            resume_name = os.path.basename(file).replace("-", "_").lower().replace(".pdf", "")
            data = self.extract_text_from_pdf_path(file)
            if len(data) <= 1:
                # No usable text came back (the pages are images), so fall
                # back to OCR.
                data = self.extract_text_from_image(file)
            self.resume_data[job_title + "_" + resume_name] = data
        return self.resume_data

    @staticmethod
    def deskew(image):
        """Rotate ``image`` to compensate for the skew of its content.

        Declared static because it is invoked as ``self.deskew(image)`` and
        takes no instance argument.

        Args:
            image: Colour image array accepted by ``cv2.cvtColor``.

        Returns:
            The rotated image with the same width and height, or ``image``
            unchanged when it has no foreground pixels.

        Raises:
            ImportError: If the optional image-processing packages could not
                be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")
        # NOTE(review): callers pass arrays built from pdf2image pages, which
        # are presumably RGB rather than BGR - confirm the conversion code.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        if coords.size == 0:
            # Nothing to fit a rectangle to, so there is no skew to measure.
            return image
        angle = cv2.minAreaRect(coords)[-1]
        # NOTE(review): this normalisation assumes minAreaRect reports angles
        # in [-90, 0); verify against the installed OpenCV version.
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    @staticmethod
    def get_text_from_image(image):
        """Run OCR on ``image`` and return the recognised text.

        Declared static because it is invoked as
        ``self.get_text_from_image(image)`` and takes no instance argument.

        Args:
            image: Image accepted by ``pytesseract.image_to_string``.

        Returns:
            str: The text produced by pytesseract.

        Raises:
            ImportError: If the optional image-processing packages could not
                be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from a PDF file uploaded via Streamlit.

        Failures are reported as strings starting with ``"Error"`` rather
        than raised, so the result can be displayed to the user directly.

        Args:
            pdf_file: Uploaded file object exposing ``getvalue()`` that
                returns the raw PDF bytes.

        Returns:
            str: The concatenated page text, or an error message when the
            file is larger than 5 MB, contains no extractable text, or cannot
            be parsed.
        """
        try:
            # Read the payload once; it is needed for the size check and for
            # parsing.
            raw = pdf_file.getvalue()
            file_size = len(raw) / (1024 * 1024)  # Size in MB
            if file_size > 5:
                return "Error: File size exceeds 5MB limit. Please upload a smaller file."
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(raw))
            # Pages without text may yield None; treat them as empty.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            if not text.strip():
                return "Error: Could not extract text from PDF. The file might be scanned or image-based."
            return text
        except Exception as e:
            # Deliberately broad: any parsing problem becomes a user-facing
            # message instead of an exception.
            return f"Error processing PDF: {str(e)}"