Spaces:

ruslanmv
/

CV_Ranking

Sleeping

App Files Files Community

CV_Ranking / app.py

ruslanmv

Update app.py

250e7d4 10 months ago

raw

history blame contribute delete

6.75 kB

	import os
	import gradio as gr
	import PyPDF2
	import docx2txt
	import logging

	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	nltk.download('punkt_tab')
	# Configure logging
	logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

	# ----------------------------------------------------------------------------
	# 1) Utility Functions: Parsing & Preprocessing
	# ----------------------------------------------------------------------------

	def extract_text_from_pdf(file_obj):
	"""Extract all text from a PDF file object."""
	text_content = []
	try:
	logging.info("Loading PDF file.")
	pdf_reader = PyPDF2.PdfReader(file_obj)
	for page in pdf_reader.pages:
	page_text = page.extract_text()
	if page_text:
	text_content.append(page_text)
	extracted_text = "\n".join(text_content)
	logging.info(f"Extracted PDF content: {extracted_text[:500]}...")

	print(extracted_text) # Print the extracted text

	return extracted_text
	except Exception as e:
	logging.error(f"Error reading PDF: {e}")
	return f"Error reading PDF: {e}"

	def extract_text_from_docx(file_path):
	"""Extract all text from a DOCX file on disk."""
	try:
	logging.info("Loading DOCX file.")
	extracted_text = docx2txt.process(file_path)
	logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")

	print(extracted_text) # Print the extracted text

	return extracted_text
	except Exception as e:
	logging.error(f"Error reading DOCX: {e}")
	return f"Error reading DOCX: {e}"

	def extract_text_from_txt(file_obj):
	"""Extract all text from a TXT file object."""
	try:
	logging.info("Loading TXT file.")
	extracted_text = file_obj.read().decode("utf-8", errors="ignore")
	logging.info(f"Extracted TXT content: {extracted_text[:500]}...")

	print(extracted_text) # Print the extracted text

	return extracted_text
	except Exception as e:
	logging.error(f"Error reading TXT: {e}")
	return f"Error reading TXT: {e}"

	def preprocess_text(text):
	"""
	Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
	and then rejoin into a clean string.
	"""
	logging.info("Preprocessing text.")
	text = str(text).lower()
	tokens = word_tokenize(text)
	stop_words = set(stopwords.words('english'))
	filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
	processed_text = " ".join(filtered_tokens)
	logging.info(f"Preprocessed text: {processed_text[:500]}...")
	return processed_text

	# ----------------------------------------------------------------------------
	# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
	# ----------------------------------------------------------------------------

	def rank_resumes_with_tfidf(job_description: str, resumes: dict):
	logging.info("Ranking resumes using TF-IDF.")
	preprocessed_jd = preprocess_text(job_description)
	preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
	corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
	filenames = list(preprocessed_resumes.keys())
	vectorizer = TfidfVectorizer()
	tfidf_matrix = vectorizer.fit_transform(corpus)
	jd_vector = tfidf_matrix[0:1]
	resume_vectors = tfidf_matrix[1:]
	similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
	results = list(zip(filenames, similarities))
	results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
	logging.info(f"Ranking completed: {results_sorted}")
	return results_sorted

	# ----------------------------------------------------------------------------
	# 3) Gradio Callback Function
	# ----------------------------------------------------------------------------

	def analyze_cvs(job_description, cv_files):
	logging.info("Starting CV analysis.")
	resumes_data = {}

	for uploaded_file in cv_files:

	filename = os.path.basename(uploaded_file.name) #Get the base name, handling potential Gradio changes

	file_ext = os.path.splitext(filename)[1].lower()
	temp_filepath = None

	try:
	logging.info(f"Processing file: {filename}")
	if file_ext == ".pdf":
	with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio
	file_content = extract_text_from_pdf(f)
	elif file_ext == ".txt":
	with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio
	file_content = extract_text_from_txt(f)
	elif file_ext == ".docx":
	file_content = extract_text_from_docx(uploaded_file.name) #docx2txt can handle the temporary filepath
	else:
	file_content = "Unsupported file type."
	except Exception as e:
	logging.error(f"Error processing file: {e}")
	file_content = f"Error processing file: {e}"

	logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
	resumes_data[filename] = file_content

	ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
	display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
	logging.info("Analysis completed successfully.")
	return display_data

	# ----------------------------------------------------------------------------
	# 4) Gradio Interface
	# ----------------------------------------------------------------------------

	def create_gradio_interface():
	job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
	cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
	results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
	demo = gr.Interface(fn=analyze_cvs, inputs=[job_description_input, cv_input], outputs=[results_output], title="Resume Ranking with TF-IDF")
	return demo

	# ----------------------------------------------------------------------------
	# 5) Main Script
	# ----------------------------------------------------------------------------

	if __name__ == "__main__":
	nltk.download('punkt', quiet=True)
	nltk.download('stopwords', quiet=True)
	app = create_gradio_interface()
	app.launch(server_name="0.0.0.0", server_port=7860, debug=True)