Spaces:
Sleeping
Sleeping
File size: 5,688 Bytes
9607899 1a1a2a1 9607899 1a1a2a1 9607899 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | try:
import cv2
import numpy as np
from pdf2image import convert_from_path
import pytesseract
CV2_AVAILABLE = True
except ImportError:
CV2_AVAILABLE = False
from glob import glob
from pypdf import PdfReader
from tqdm import tqdm
import os
import PyPDF2
import io
class DirectoryReader:
    """
    Read job-description (JD) text files and resume PDFs into dictionaries.

    Text is extracted from PDFs with pypdf; when a resume yields (almost) no
    text, the pages are rendered to images and run through OCR instead.
    """

    def __init__(self, path_to_jds, path_to_resumes):
        """
        Store the search patterns and create empty result containers.

        Args:
            path_to_jds (str): Glob pattern matching the job description text
                files. It is passed unchanged to ``glob(..., recursive=True)``.
            path_to_resumes (str): Glob pattern matching the resume PDF files.
                It is passed unchanged to ``glob(..., recursive=True)``.
        """
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}

    @staticmethod
    def _job_name_from_path(file):
        """Return the file name of ``file`` without a trailing ``.txt``."""
        # os.path.basename honours the platform's separators, unlike
        # splitting on "/" by hand.
        job_name = os.path.basename(file)
        # Strip only the extension, not every ".txt" inside the name.
        if job_name.endswith(".txt"):
            job_name = job_name[: -len(".txt")]
        return job_name

    @staticmethod
    def _resume_key(file):
        """Return the ``<parent dir>_<file name>`` key used in ``resume_data``."""
        file_parts = os.path.normpath(file).split(os.sep)
        # The resume name is the lower-cased file name without its extension.
        resume_name = file_parts[-1].replace("-", "_").lower()
        if resume_name.endswith(".pdf"):
            resume_name = resume_name[: -len(".pdf")]
        # A bare file name has no parent directory to use as the job title.
        if len(file_parts) < 2:
            return resume_name
        # The job title is the name of the directory just before the file name.
        job_title = file_parts[-2].replace(" ", "_").lower()
        return job_title + "_" + resume_name

    def read_jd_files(self):
        """
        Read every file matched by ``path_to_jds`` into ``jd_data``.

        Each file is read as UTF-8, stripped and lower-cased. Directories
        matched by the pattern are skipped.

        Returns:
            dict: Job names (file name without ``.txt``) mapped to the
            corresponding job description text.
        """
        file_list = [
            path
            for path in glob(self.path_to_jds, recursive=True)
            if os.path.isfile(path)
        ]
        for file in tqdm(file_list):
            with open(file, "r", encoding="utf-8") as f:
                data = f.read()
            self.jd_data[self._job_name_from_path(file)] = data.strip().lower()
        return self.jd_data

    @staticmethod
    def extract_text_from_pdf_path(file_path):
        """
        Extract text from a PDF file given a file path (for batch processing).

        Args:
            file_path (str): Path of the PDF to read.

        Returns:
            str: Text of all pages joined by newlines, stripped and
            lower-cased. Empty when no page yields any text.
        """
        reader = PdfReader(file_path)
        # "or ''" keeps a page without extractable text from breaking the join.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts).strip().lower()

    def extract_text_from_image(self, file):
        """
        Extract text from an image-based PDF by deskewing and OCR-ing each page.

        Args:
            file (str): Path of the PDF to convert with ``convert_from_path``.

        Returns:
            str: OCR text of all pages joined by newlines, stripped and
            lower-cased.

        Raises:
            ImportError: If the optional image-processing packages failed to
                import.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")
        pages = convert_from_path(file)
        extracted_text = []
        for page in pages:
            # Step 1: Preprocess the image (deskew)
            preprocessed_image = self.deskew(np.array(page))
            # Step 2: Extract text using OCR
            extracted_text.append(self.get_text_from_image(preprocessed_image))
        return "\n".join(extracted_text).strip().lower()

    def read_resume_files(self):
        """
        Read every file matched by ``path_to_resumes`` into ``resume_data``.

        Text is extracted directly first; if that yields at most one
        character, the PDF is assumed to contain images and OCR is used.
        Directories matched by the pattern are skipped.

        Returns:
            dict: Keys of the form ``<parent directory>_<file name>`` mapped
            to the corresponding resume text.

        Raises:
            ImportError: If the OCR fallback is needed but the optional
                image-processing packages failed to import.
        """
        file_list = [
            path
            for path in glob(self.path_to_resumes, recursive=True)
            if os.path.isfile(path)
        ]
        for file in tqdm(file_list):
            key = self._resume_key(file)
            data = self.extract_text_from_pdf_path(file)
            if len(data) > 1:
                self.resume_data[key] = data
            else:  # to solve for incorrect startxref pointer(3), since they are images in pdf
                self.resume_data[key] = self.extract_text_from_image(file)
        return self.resume_data

    @staticmethod
    def deskew(image):
        """
        Rotate ``image`` by the skew angle estimated from its non-background pixels.

        Args:
            image: Colour image array accepted by ``cv2.cvtColor``.

        Returns:
            The rotated image, with the same width and height as the input.

        Raises:
            ImportError: If the optional image-processing packages failed to
                import.
        """
        if not CV2_AVAILABLE:
            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")
        # NOTE(review): the flag assumes BGR channel order; pages produced by
        # np.array(page) in extract_text_from_image are presumably RGB - confirm
        # whether the difference matters for skew estimation.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Invert so that dark content becomes non-zero.
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        angle = cv2.minAreaRect(coords)[-1]
        # NOTE(review): this normalisation assumes minAreaRect reports angles
        # in [-90, 0); confirm the range used by the installed OpenCV version.
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated

    @staticmethod
    def get_text_from_image(image):
        """
        Run OCR on ``image`` and return the recognised text.

        Raises:
            ImportError: If the optional image-processing packages failed to
                import.
        """
        if not CV2_AVAILABLE:
            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")
        return pytesseract.image_to_string(image)

    def extract_text_from_pdf(self, pdf_file):
        """
        Extract text from a PDF file uploaded via Streamlit.

        Args:
            pdf_file: Uploaded file object exposing ``getvalue()`` that returns
                the PDF content as bytes.

        Returns:
            str: The concatenated page text on success, otherwise a message
            starting with ``"Error"`` describing what went wrong. Failures are
            reported through the return value rather than raised.
        """
        try:
            # Read the upload once; each getvalue() call returns the full content.
            pdf_bytes = pdf_file.getvalue()
            # Check file size
            file_size = len(pdf_bytes) / (1024 * 1024)  # Size in MB
            if file_size > 5:
                return "Error: File size exceeds 5MB limit. Please upload a smaller file."
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # Handle None returns from pages without extractable text.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
            if not text.strip():
                return "Error: Could not extract text from PDF. The file might be scanned or image-based."
            return text
        except Exception as e:
            # Deliberately broad: any failure is turned into a message for the caller.
            return f"Error processing PDF: {str(e)}"
|