Spaces:

shekkari21
/

resume-analyzer

Sleeping

App Files Files Community

resume-analyzer / Code+Folder /src /directory_reader.py

shekkari21

updated scripts

1a1a2a1 17 days ago

raw

history blame contribute delete

5.69 kB

	# Optional OCR stack. All four packages are needed for the image-based
	# fallback (PDF -> page images -> deskew -> Tesseract); if any one of them is
	# missing the module still imports and only the OCR features are disabled.
	try:
	    import cv2
	    import numpy as np
	    from pdf2image import convert_from_path
	    import pytesseract
	except ImportError:
	    # Despite its name, this flag covers the whole OCR stack, not just OpenCV.
	    CV2_AVAILABLE = False
	else:
	    CV2_AVAILABLE = True

	from glob import glob
	from pypdf import PdfReader
	from tqdm import tqdm
	import os
	import PyPDF2
	import io

	class DirectoryReader:
	    """
	    Reads job description (JD) text files and resume PDF files from disk.

	    Extracted texts are accumulated in two dictionaries, ``jd_data`` and
	    ``resume_data``. Resumes without a usable text layer are run through OCR
	    when the optional OCR packages are installed.
	    """

	    # Upper bound, in megabytes, for uploads accepted by extract_text_from_pdf.
	    MAX_UPLOAD_SIZE_MB = 5

	    def __init__(self, path_to_jds, path_to_resumes):
	        """
	        Initializes the DirectoryReader with the locations of JDs and resumes.

	        Args:
	            path_to_jds (str): Glob pattern matching the job description text
	                files. It is passed to ``glob(..., recursive=True)`` as is.
	            path_to_resumes (str): Glob pattern matching the resume PDF files.
	                It is passed to ``glob(..., recursive=True)`` as is.
	        """
	        self.path_to_jds = path_to_jds
	        self.path_to_resumes = path_to_resumes
	        self.jd_data = {}
	        self.resume_data = {}

	    @staticmethod
	    def _strip_suffix(name, suffix):
	        """Return ``name`` without a trailing ``suffix`` (unchanged if absent)."""
	        if suffix and name.endswith(suffix):
	            return name[:-len(suffix)]
	        return name

	    @staticmethod
	    def _jd_name(file):
	        """Return the JD key for ``file``: its base name without ``.txt``."""
	        base_name = os.path.basename(os.path.normpath(file))
	        return DirectoryReader._strip_suffix(base_name, ".txt")

	    @staticmethod
	    def _resume_key(file):
	        """
	        Return the resume key for ``file``: ``<parent dir>_<file name>``.

	        Both parts are lower-cased; spaces in the directory name and dashes in
	        the file name become underscores, and a trailing ``.pdf`` is dropped.
	        When the path has no parent directory the key is the file name alone.
	        """
	        normalized = os.path.normpath(file)
	        # The job title is the name of the directory that holds the resume.
	        job_title = os.path.basename(os.path.dirname(normalized))
	        job_title = job_title.replace(" ", "_").lower()
	        resume_name = os.path.basename(normalized).replace("-", "_").lower()
	        resume_name = DirectoryReader._strip_suffix(resume_name, ".pdf")
	        if not job_title:
	            return resume_name
	        return job_title + "_" + resume_name

	    def read_jd_files(self):
	        """
	        Reads every file matched by ``path_to_jds`` into ``jd_data``.

	        Each file is read as UTF-8, stripped and lower-cased. Directories
	        matched by the pattern are skipped.

	        Returns:
	            dict: Job names (file name without ``.txt``) mapped to the job
	                description text.
	        """
	        for file in tqdm(glob(self.path_to_jds, recursive=True)):
	            # A recursive pattern can match directories; open() would fail.
	            if not os.path.isfile(file):
	                continue
	            with open(file, "r", encoding="utf-8") as f:
	                content = f.read().strip().lower()
	            self.jd_data[self._jd_name(file)] = content
	        return self.jd_data

	    @staticmethod
	    def extract_text_from_pdf_path(file_path):
	        """
	        Extract text from a PDF file given a file path (for batch processing).

	        Args:
	            file_path (str): Path of the PDF to read.

	        Returns:
	            str: Page texts joined by newlines, stripped and lower-cased.
	        """
	        reader = PdfReader(file_path)
	        # Guard against a page yielding None instead of a string.
	        page_texts = [page.extract_text() or "" for page in reader.pages]
	        return "\n".join(page_texts).strip().lower()

	    def extract_text_from_image(self, file):
	        """
	        Extract text from a PDF by rendering its pages and running OCR on them.

	        Args:
	            file (str): Path of the PDF to process.

	        Returns:
	            str: OCR output of all pages joined by newlines, stripped and
	                lower-cased.

	        Raises:
	            ImportError: If the optional OCR packages are not installed.
	        """
	        if not CV2_AVAILABLE:
	            raise ImportError("OpenCV, numpy, pdf2image, or pytesseract is not installed. Install these packages to use image processing features.")

	        extracted_text = []
	        for page in convert_from_path(file):
	            # Step 1: straighten the page; Step 2: run OCR on the result.
	            straightened = self.deskew(np.array(page))
	            extracted_text.append(self.get_text_from_image(straightened))
	        return "\n".join(extracted_text).strip().lower()

	    def read_resume_files(self):
	        """
	        Reads every file matched by ``path_to_resumes`` into ``resume_data``.

	        The text layer of each PDF is tried first. When it yields at most one
	        character (e.g. the PDF only contains scanned images), OCR is used
	        instead. Directories matched by the pattern are skipped.

	        Returns:
	            dict: Keys of the form ``<job title>_<resume name>`` mapped to the
	                extracted resume text.
	        """
	        for file in tqdm(glob(self.path_to_resumes, recursive=True)):
	            if not os.path.isfile(file):
	                continue
	            text = self.extract_text_from_pdf_path(file)
	            if len(text) <= 1:
	                # No usable text layer: the pages are images, so fall back to OCR.
	                text = self.extract_text_from_image(file)
	            self.resume_data[self._resume_key(file)] = text
	        return self.resume_data

	    @staticmethod
	    def deskew(image):
	        """
	        Rotate ``image`` to compensate for the skew of its content.

	        Args:
	            image: Three-channel image array as accepted by ``cv2.cvtColor``.

	        Returns:
	            The rotated image, or ``image`` itself when it has no non-white
	            pixels to estimate an angle from.

	        Raises:
	            ImportError: If the optional OCR packages are not installed.
	        """
	        if not CV2_AVAILABLE:
	            raise ImportError("OpenCV is not installed. Install opencv-python to use this feature.")

	        # Invert so that content is non-zero and the background is zero.
	        gray = cv2.bitwise_not(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
	        coords = np.column_stack(np.where(gray > 0))
	        if coords.size == 0:
	            # Blank page: minAreaRect cannot be computed from zero points.
	            return image

	        angle = cv2.minAreaRect(coords)[-1]
	        # NOTE(review): this mapping assumes minAreaRect reports angles in
	        # [-90, 0); newer OpenCV releases are reported to use a different
	        # range - confirm against the installed version.
	        if angle < -45:
	            angle = -(90 + angle)
	        else:
	            angle = -angle

	        (h, w) = image.shape[:2]
	        center = (w // 2, h // 2)
	        rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
	        return cv2.warpAffine(image, rotation, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

	    @staticmethod
	    def get_text_from_image(image):
	        """
	        Run Tesseract OCR on ``image`` and return the recognised text.

	        Raises:
	            ImportError: If the optional OCR packages are not installed.
	        """
	        if not CV2_AVAILABLE:
	            raise ImportError("Pytesseract is not installed. Install pytesseract to use this feature.")

	        return pytesseract.image_to_string(image)

	    def extract_text_from_pdf(self, pdf_file):
	        """
	        Extract text from a PDF file uploaded via Streamlit.

	        Failures are reported as a message instead of an exception so the
	        caller can display the result directly.

	        Args:
	            pdf_file: Uploaded file object exposing ``getvalue()`` -> bytes.

	        Returns:
	            str: The page texts joined by newlines, or a string starting with
	                ``"Error"`` when the file is too large, has no extractable
	                text, or cannot be processed.
	        """
	        try:
	            # Read the upload once; getvalue() returns the whole content.
	            payload = pdf_file.getvalue()
	            limit_mb = self.MAX_UPLOAD_SIZE_MB
	            if len(payload) / (1024 * 1024) > limit_mb:
	                return f"Error: File size exceeds {limit_mb}MB limit. Please upload a smaller file."

	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(payload))
	            # Newline separator keeps the last word of a page from fusing with
	            # the first word of the next one; None page results become "".
	            text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

	            if not text.strip():
	                return "Error: Could not extract text from PDF. The file might be scanned or image-based."

	            return text
	        except Exception as e:
	            # Deliberately broad: this is the UI boundary for uploads.
	            return f"Error processing PDF: {str(e)}"