File size: 5,688 Bytes
9607899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a1a2a1
 
 
9607899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a1a2a1
9607899
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
try:
    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    import pytesseract
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
    
from glob import glob
from pypdf import PdfReader
from tqdm import tqdm
import os
import PyPDF2
import io

class DirectoryReader:
    """
    Read and process job description (JD) files and resume files.

    Job descriptions are plain-text files and resumes are PDF files. Both are
    located through glob patterns, and the extracted (lower-cased) text is
    cached on the instance in ``jd_data`` and ``resume_data``.
    """

    # Largest upload, in megabytes, accepted by ``extract_text_from_pdf``.
    MAX_UPLOAD_SIZE_MB = 5

    def __init__(self, path_to_jds, path_to_resumes):
        """
        Initializes the DirectoryReader with paths to job descriptions and resumes.

        Args:
            path_to_jds (str): Glob pattern matching the job description files
                (it is passed to ``glob`` with ``recursive=True``).
            path_to_resumes (str): Glob pattern matching the resume PDF files
                (it is passed to ``glob`` with ``recursive=True``).
        """
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}

    @staticmethod
    def _strip_suffix(name, suffix):
        """Return ``name`` without a trailing ``suffix``; other occurrences are kept."""
        if suffix and name.endswith(suffix):
            return name[: -len(suffix)]
        return name

    @staticmethod
    def _job_name(file):
        """Return the JD key for ``file``: its base name without a trailing ``.txt``."""
        # os.path.basename honours the platform separator, unlike split("/").
        return DirectoryReader._strip_suffix(os.path.basename(file), ".txt")

    @staticmethod
    def _resume_key(file):
        """Return the ``<job_title>_<resume_name>`` key used in ``resume_data``."""
        parts = os.path.normpath(file).split(os.sep)
        # The job title is the directory just before the file name; a path
        # without a parent directory yields an empty job title instead of
        # raising IndexError.
        job_dir = parts[-2] if len(parts) > 1 else ""
        job_title = job_dir.replace(" ", "_").lower()
        # The resume name is the file name without its (trailing) extension.
        resume_name = DirectoryReader._strip_suffix(
            parts[-1].replace("-", "_").lower(), ".pdf"
        )
        return job_title + "_" + resume_name

    @staticmethod
    def _normalize_skew_angle(angle):
        """
        Convert a ``cv2.minAreaRect`` angle into the rotation that deskews the page.

        OpenCV reports the angle either in ``[-90, 0)`` (older releases) or in
        ``(0, 90]`` (newer releases). Both ranges are mapped onto the same
        correction so an upright page is never rotated by a quarter turn.
        """
        if angle < -45:
            return -(90 + angle)
        if angle >= 45:
            return 90 - angle
        return -angle

    def read_jd_files(self):
        """
        Reads job description files matched by ``path_to_jds`` and stores the
        content in the jd_data attribute.

        Each file is read as UTF-8, stripped and lower-cased. Directories
        returned by the glob pattern are skipped.

        Returns:
            dict: A dictionary with job names as keys and the corresponding job descriptions as values.
        """
        file_list = glob(self.path_to_jds, recursive=True)
        for file in tqdm(file_list):
            if os.path.isdir(file):
                # Recursive patterns such as "**" also match directories.
                continue
            with open(file, "r", encoding="utf-8") as f:
                data = f.read().strip().lower()
            self.jd_data[self._job_name(file)] = data
        return self.jd_data

    @staticmethod
    def extract_text_from_pdf_path(file_path):
        """
        Extract text from a PDF file given a file path (for batch processing).

        Args:
            file_path (str): Path of the PDF to read.

        Returns:
            str: Text of all pages, one page per line group, stripped and lower-cased.
        """
        reader = PdfReader(file_path)
        # A page without extractable text contributes an empty string, matching
        # the None handling in extract_text_from_pdf.
        page_texts = ((page.extract_text() or "") + "\n" for page in reader.pages)
        return "".join(page_texts).strip().lower()

    def extract_text_from_image(self, file):
        """
        Extract text from an image-based PDF by running OCR on every page.

        Args:
            file (str): Path of the PDF to rasterize.

        Returns:
            str: OCR text of all pages joined by newlines, stripped and lower-cased.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")

        pages = convert_from_path(file)
        # Per page: deskew first, then run OCR on the straightened image.
        extracted_text = [
            self.get_text_from_image(self.deskew(np.array(page))) for page in pages
        ]
        return "\n".join(extracted_text).strip().lower()

    def read_resume_files(self):
        """
        Reads resume files matched by ``path_to_resumes`` and stores the content
        in the resume_data attribute.

        If direct text extraction yields at most one character (e.g. the PDF
        only contains page images), OCR is used to extract the text instead.
        Directories returned by the glob pattern are skipped.

        Returns:
            dict: A dictionary with resume identifiers as keys and the corresponding resume texts as values.
        """
        file_list = glob(self.path_to_resumes, recursive=True)
        for file in tqdm(file_list):
            if os.path.isdir(file):
                # Recursive patterns such as "**" also match directories.
                continue
            key = self._resume_key(file)
            data = self.extract_text_from_pdf_path(file)
            if len(data) <= 1:
                # No usable text layer: fall back to OCR on the rendered pages.
                data = self.extract_text_from_image(file)
            self.resume_data[key] = data
        return self.resume_data

    @staticmethod
    def deskew(image):
        """
        Rotate ``image`` so that its content is axis-aligned.

        Args:
            image: Colour image array accepted by ``cv2.cvtColor``.

        Returns:
            The rotated image, or ``image`` itself when it has no foreground
            pixels from which an angle could be estimated.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Invert so that dark content on a light page becomes non-zero.
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        if coords.size == 0:
            # Blank page: there is nothing to fit a rectangle around.
            return image

        angle = DirectoryReader._normalize_skew_angle(cv2.minAreaRect(coords)[-1])

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    @staticmethod
    def get_text_from_image(image):
        """
        Run OCR on ``image`` and return the recognized text.

        Raises:
            ImportError: If the optional OCR dependencies could not be imported.
        """
        if not CV2_AVAILABLE:
            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")

        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf_file):
        """
        Extract text from a PDF file uploaded via Streamlit.

        Args:
            pdf_file: Upload object exposing ``getvalue()`` that returns the
                PDF content as bytes.

        Returns:
            str: The extracted text. Failures are not raised; they are reported
            as a string starting with ``"Error"`` (file too large, no
            extractable text, or any exception raised while processing).
        """
        try:
            # Read the upload once; it is needed for the size check and parsing.
            payload = pdf_file.getvalue()
            limit_mb = self.MAX_UPLOAD_SIZE_MB
            if len(payload) / (1024 * 1024) > limit_mb:
                return f"Error: File size exceeds {limit_mb}MB limit. Please upload a smaller file."

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
            # extract_text() may return None for a page; treat that as empty.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

            if not text.strip():
                return "Error: Could not extract text from PDF. The file might be scanned or image-based."

            return text
        except Exception as e:
            return f"Error processing PDF: {str(e)}"