# Source: Hugging Face Spaces - shekkari21/resume-analyzer (status: Sleeping)
# File size: 5,688 Bytes

try:
    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    import pytesseract
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
    
from glob import glob
from pypdf import PdfReader
from tqdm import tqdm
import os
import PyPDF2
import io

class DirectoryReader:
    """
    Read job description (JD) text files and resume PDFs into dictionaries.

    Both paths are handed to ``glob`` (with ``recursive=True``), so they are
    glob patterns rather than plain directory names. Extracted text is
    stripped and lower-cased by the batch readers; resumes whose PDF text
    layer is empty fall back to OCR.
    """

    def __init__(self, path_to_jds, path_to_resumes):
        """
        Initializes the DirectoryReader with paths to job descriptions and resumes.

        Args:
            path_to_jds (str): Glob pattern matching the job description files.
            path_to_resumes (str): Glob pattern matching the resume files.
        """
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}

    @staticmethod
    def _strip_suffix(name, suffix):
        """Return ``name`` without a trailing ``suffix`` (unchanged if absent)."""
        if suffix and name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    @staticmethod
    def _job_name_from_path(file_path):
        """Return the JD key for ``file_path``: its base name minus a trailing ``.txt``."""
        # os.path.basename honours the platform separator, and only the
        # extension is removed, so "v1.txt.final.txt" keeps its inner ".txt".
        return DirectoryReader._strip_suffix(os.path.basename(file_path), ".txt")

    @staticmethod
    def _resume_key(file_path):
        """Return the resume key ``<job_title>_<resume_name>`` for ``file_path``."""
        normalized = os.path.normpath(file_path)
        # The job title is the directory that directly contains the file; it
        # is empty (not an IndexError) when the path has no directory part.
        job_title = os.path.basename(os.path.dirname(normalized))
        job_title = job_title.replace(" ", "_").lower()
        # The resume name is the file name without its ".pdf" extension.
        resume_name = os.path.basename(normalized).replace("-", "_").lower()
        resume_name = DirectoryReader._strip_suffix(resume_name, ".pdf")
        return job_title + "_" + resume_name

    @staticmethod
    def _correction_angle(angle):
        """
        Convert a ``cv2.minAreaRect`` angle into the rotation that undoes the skew.

        OpenCV releases before 4.5.1 report the angle in [-90, 0); later
        releases report it in (0, 90]. Both ranges are folded into a
        correction within [-45, 45] so an upright page yields 0 either way.
        """
        if angle < -45:
            return -(90 + angle)
        if angle >= 45:
            return 90 - angle
        return -angle

    def read_jd_files(self):
        """
        Reads job description files matching ``path_to_jds`` and stores the content in jd_data.

        Each file is read as UTF-8, stripped and lower-cased. Directories
        matched by the pattern are skipped.

        Returns:
            dict: A dictionary with job names as keys and the corresponding job descriptions as values.
        """
        file_list = [
            path for path in glob(self.path_to_jds, recursive=True)
            if os.path.isfile(path)
        ]
        for file in tqdm(file_list):
            with open(file, "r", encoding="utf-8") as f:
                data = f.read().strip().lower()
            self.jd_data[self._job_name_from_path(file)] = data
        return self.jd_data

    @staticmethod
    def extract_text_from_pdf_path(file_path):
        """
        Extract text from a PDF file given a file path (for batch processing).

        Args:
            file_path (str): Path of the PDF to read.

        Returns:
            str: Page texts joined by newlines, stripped and lower-cased.
        """
        reader = PdfReader(file_path)
        # A page without a text layer may yield None/empty text; treat it as
        # empty instead of failing on str + None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts).strip().lower()

    def extract_text_from_image(self, file):
        """
        OCR a PDF by rendering each page to an image, deskewing it and reading the text.

        Args:
            file (str): Path of the PDF to render.

        Returns:
            str: OCR text of all pages joined by newlines, stripped and lower-cased.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")

        extracted_text = []
        for page in convert_from_path(file):
            # Step 1: Preprocess the image (deskew)
            preprocessed_image = self.deskew(np.array(page))
            # Step 2: Extract text using OCR
            extracted_text.append(self.get_text_from_image(preprocessed_image))
        return "\n".join(extracted_text).strip().lower()

    def read_resume_files(self):
        """
        Reads resume files matching ``path_to_resumes`` and stores the content in resume_data.
        If a PDF yields (almost) no text, it is assumed to contain images and OCR is used instead.

        Returns:
            dict: A dictionary with resume identifiers as keys and the corresponding resume texts as values.
        """
        file_list = [
            path for path in glob(self.path_to_resumes, recursive=True)
            if os.path.isfile(path)
        ]
        for file in tqdm(file_list):
            key = self._resume_key(file)
            data = self.extract_text_from_pdf_path(file)
            if len(data) <= 1:
                # No usable text layer (image-only PDF): fall back to OCR.
                data = self.extract_text_from_image(file)
            self.resume_data[key] = data
        return self.resume_data

    @staticmethod
    def deskew(image):
        """
        Rotate ``image`` so that its content is axis-aligned.

        Args:
            image (numpy.ndarray): Page image, either 3-channel colour or
                already single-channel grayscale.

        Returns:
            numpy.ndarray: The rotated image, same size as the input. The
            input is returned unchanged when it has no non-white pixels.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")

        # Only the "is this pixel pure white" test below depends on the gray
        # values, so the channel order of the colour input does not matter.
        gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        if coords.size == 0:
            # Blank page: nothing to fit a rectangle to, so skip the rotation
            # rather than hand OpenCV an empty point set.
            return image

        angle = DirectoryReader._correction_angle(cv2.minAreaRect(coords)[-1])

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    @staticmethod
    def get_text_from_image(image):
        """
        Run Tesseract OCR on ``image`` and return the recognised text.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")

        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf_file):
        """
        Extract text from a PDF file uploaded via Streamlit.

        Args:
            pdf_file: Upload object exposing ``getvalue()`` that returns the
                PDF bytes.

        Returns:
            str: The page texts joined by newlines, or a message starting
            with "Error" when the file is larger than 5MB, has no
            extractable text, or cannot be processed. Errors are reported
            through the return value rather than raised.
        """
        try:
            # Read the upload once; getvalue() returns a fresh copy per call.
            payload = pdf_file.getvalue()
            file_size = len(payload) / (1024 * 1024)  # Size in MB
            if file_size > 5:
                return "Error: File size exceeds 5MB limit. Please upload a smaller file."

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
            # Separate pages with a newline so the last word of one page does
            # not fuse with the first word of the next.
            text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

            if not text.strip():
                return "Error: Could not extract text from PDF. The file might be scanned or image-based."

            return text
        except Exception as e:
            return f"Error processing PDF: {str(e)}"