resume-analyzer / Code+Folder /src /directory_reader.py
shekkari21's picture
updated scripts
1a1a2a1
# Optional OCR / image-processing stack. The four imports are attempted as a
# group: if any one of them is missing, CV2_AVAILABLE is set to False and the
# image-based code paths raise ImportError when they are called.
try:
    import cv2
    import numpy as np
    from pdf2image import convert_from_path
    import pytesseract
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False
# Required dependencies (no fallback if these are missing).
from glob import glob
from pypdf import PdfReader
from tqdm import tqdm
import os
import PyPDF2
import io
class DirectoryReader:
    """
    Read job-description (JD) text files and resume PDFs into in-memory dictionaries.

    Both paths are passed to ``glob`` (with ``recursive=True``), so they are
    glob patterns rather than plain directory names.
    """

    def __init__(self, path_to_jds, path_to_resumes):
        """
        Initializes the DirectoryReader with paths to job descriptions and resumes.

        Args:
            path_to_jds (str): Glob pattern matching the job description files.
            path_to_resumes (str): Glob pattern matching the resume files.
        """
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}

    @staticmethod
    def _regular_files(pattern):
        """Return the glob matches for *pattern* that are regular files."""
        # Recursive patterns such as "dir/**" also match directories, which
        # cannot be opened as text or parsed as PDF.
        return [path for path in glob(pattern, recursive=True) if os.path.isfile(path)]

    @staticmethod
    def _strip_suffix(name, suffix):
        """Return *name* without a trailing *suffix*; interior occurrences are kept."""
        if suffix and name.endswith(suffix):
            return name[: -len(suffix)]
        return name

    def read_jd_files(self):
        """
        Reads job description files matched by ``path_to_jds`` and stores them in ``jd_data``.

        Each file is read as UTF-8, stripped and lower-cased. The key is the
        file's base name with a trailing ".txt" removed.

        Returns:
            dict: Job names mapped to the corresponding job description text.
        """
        for file in tqdm(self._regular_files(self.path_to_jds)):
            with open(file, "r", encoding="utf-8") as f:
                data = f.read().strip().lower()
            # os.path.basename understands the platform's separators, unlike
            # splitting on "/" by hand.
            job_name = self._strip_suffix(os.path.basename(file), ".txt")
            self.jd_data[job_name] = data
        return self.jd_data

    @staticmethod
    def extract_text_from_pdf_path(file_path):
        """
        Extract text from a PDF file given a file path (for batch processing).

        Args:
            file_path (str): Path of the PDF to read.

        Returns:
            str: Page texts joined by newlines, stripped and lower-cased.
        """
        reader = PdfReader(file_path)
        # "or ''" keeps a page without extractable text from breaking the join,
        # matching the handling in extract_text_from_pdf.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts).strip().lower()

    def extract_text_from_image(self, file):
        """
        OCR a PDF by rendering each page to an image, deskewing it and reading the text.

        Args:
            file (str): Path of the PDF to render.

        Returns:
            str: OCR text of all pages joined by newlines, stripped and lower-cased.

        Raises:
            ImportError: If the optional image-processing packages are not installed.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")
        pages = convert_from_path(file)
        extracted_text = []
        for page in pages:
            # Step 1: Preprocess the image (deskew)
            preprocessed_image = self.deskew(np.array(page))
            # Step 2: Extract text using OCR
            extracted_text.append(self.get_text_from_image(preprocessed_image))
        return "\n".join(extracted_text).strip().lower()

    def read_resume_files(self):
        """
        Reads resume files matched by ``path_to_resumes`` and stores them in ``resume_data``.

        The key is ``<job_title>_<resume_name>``: the parent directory name
        (spaces replaced by underscores, lower-cased) and the file name (hyphens
        replaced by underscores, lower-cased, trailing ".pdf" removed). A file
        with no parent directory component gets an empty job title.
        If the extracted text is at most one character long, the PDF is assumed
        to contain images and OCR is used instead.

        Returns:
            dict: Resume identifiers mapped to the corresponding resume text.

        Raises:
            ImportError: If OCR is needed but the optional packages are missing.
        """
        for file in tqdm(self._regular_files(self.path_to_resumes)):
            normalized = os.path.normpath(file)
            # The job title is the name of the directory just before the file name.
            parent_name = os.path.basename(os.path.dirname(normalized))
            job_title = parent_name.replace(" ", "_").lower()
            # The resume name is the file name without its extension; it is
            # lower-cased first so ".PDF" is removed as well.
            base_name = os.path.basename(normalized).replace("-", "_").lower()
            resume_name = self._strip_suffix(base_name, ".pdf")
            key = job_title + "_" + resume_name

            data = self.extract_text_from_pdf_path(file)
            if len(data) > 1:
                self.resume_data[key] = data
            else:
                # No usable text layer (e.g. PDFs reported with an incorrect
                # startxref pointer that actually hold page images): use OCR.
                self.resume_data[key] = self.extract_text_from_image(file)
        return self.resume_data

    @staticmethod
    def deskew(image):
        """
        Rotate *image* to compensate for the skew estimated from its non-zero pixels.

        The angle comes from the minimum-area rectangle around every pixel that
        is non-zero after grayscale conversion and inversion.

        Args:
            image: Colour image array accepted by ``cv2.cvtColor``.

        Returns:
            The rotated image, same width and height as the input.

        Raises:
            ImportError: If the optional image-processing packages are not installed.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")
        # NOTE(review): callers pass pdf2image pages, which are presumably RGB,
        # while this conversion uses the BGR channel order - confirm.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        # np.where yields (row, column) indices, so each point is (y, x).
        coords = np.column_stack(np.where(gray > 0))
        angle = cv2.minAreaRect(coords)[-1]
        # NOTE(review): this normalisation assumes minAreaRect reports angles in
        # [-90, 0); newer OpenCV releases reportedly use (0, 90] - confirm
        # against the installed version.
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    @staticmethod
    def get_text_from_image(image):
        """
        Run Tesseract OCR on *image* and return the recognised text.

        Raises:
            ImportError: If the optional image-processing packages are not installed.
        """
        if not CV2_AVAILABLE:
            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf_file):
        """
        Extract text from a PDF file uploaded via Streamlit.

        Args:
            pdf_file: Uploaded file object exposing ``getvalue()`` that returns
                the PDF bytes.

        Returns:
            str: The extracted text, or a message starting with "Error" when
            the file exceeds 5MB, yields no text, or cannot be processed.
            Failures are reported through the return value, not raised.
        """
        try:
            # Read the upload once; getvalue() returns a fresh copy per call.
            payload = pdf_file.getvalue()
            file_size = len(payload) / (1024 * 1024)  # Size in MB
            if file_size > 5:
                return "Error: File size exceeds 5MB limit. Please upload a smaller file."
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
            # extract_text() may return None for a page; treat it as empty.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            if not text.strip():
                return "Error: Could not extract text from PDF. The file might be scanned or image-based."
            return text
        except Exception as e:
            # Deliberate catch-all: the caller receives a displayable message.
            return f"Error processing PDF: {str(e)}"