# Hugging Face Spaces page header captured by the scrape ("Spaces: Sleeping");
# kept as a comment so the file remains valid Python.
| import os | |
| import gradio as gr | |
| import PyPDF2 | |
| import docx2txt | |
| import logging | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# word_tokenize needs the Punkt tokenizer tables; 'punkt_tab' is the resource
# name used by recent NLTK releases.
nltk.download('punkt_tab')
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
| # ---------------------------------------------------------------------------- | |
| # 1) Utility Functions: Parsing & Preprocessing | |
| # ---------------------------------------------------------------------------- | |
def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object.

    Args:
        file_obj: A binary file object positioned at the start of a PDF.

    Returns:
        The text of all pages joined with newlines, or an error-message
        string if parsing failed (callers treat the result opaquely).
    """
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # extract_text() can return None/"" for image-only pages; skip them.
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        # Lazy %-formatting: the 500-char preview is only built if INFO is enabled.
        logging.info("Extracted PDF content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading PDF: %s", e)
        return f"Error reading PDF: {e}"
def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk.

    Args:
        file_path: Filesystem path to a .docx document.

    Returns:
        The document text, or an error-message string if parsing failed
        (callers treat the result opaquely).
    """
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        # Lazy %-formatting; %.500s truncates the preview to 500 characters.
        logging.info("Extracted DOCX content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading DOCX: %s", e)
        return f"Error reading DOCX: {e}"
def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object.

    Args:
        file_obj: A binary file object; bytes are decoded as UTF-8 with
            undecodable sequences ignored.

    Returns:
        The decoded text, or an error-message string if reading failed
        (callers treat the result opaquely).
    """
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        # Lazy %-formatting; %.500s truncates the preview to 500 characters.
        logging.info("Extracted TXT content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading TXT: %s", e)
        return f"Error reading TXT: {e}"
def preprocess_text(text):
    """Normalize raw text for TF-IDF.

    Lowercases the input, tokenizes it, drops stopwords and
    non-alphabetic tokens, and rejoins the survivors into one string.
    """
    logging.info("Preprocessing text.")
    lowered = str(text).lower()
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in english_stopwords
    ]
    processed_text = " ".join(kept_tokens)
    logging.info(f"Preprocessed text: {processed_text[:500]}...")
    return processed_text
| # ---------------------------------------------------------------------------- | |
| # 2) Core Ranking Logic with TF-IDF & Cosine Similarity | |
| # ---------------------------------------------------------------------------- | |
def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by TF-IDF cosine similarity to the job description.

    Args:
        job_description: Raw job-description text.
        resumes: Mapping of filename -> raw resume text.

    Returns:
        List of (filename, similarity) tuples sorted by score, descending.
        An empty list when no resumes were supplied.
    """
    logging.info("Ranking resumes using TF-IDF.")
    # Guard: with no resumes the vectorizer would be fit on a one-document
    # corpus and there would be nothing to compare against.
    if not resumes:
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    # Fit a single vocabulary over the JD plus all resumes so the vectors
    # live in the same feature space and are comparable.
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    jd_vector = tfidf_matrix[0:1]      # row 0 is the job description
    resume_vectors = tfidf_matrix[1:]  # remaining rows are the resumes
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results_sorted = sorted(zip(filenames, similarities), key=lambda x: x[1], reverse=True)
    logging.info("Ranking completed: %s", results_sorted)
    return results_sorted
| # ---------------------------------------------------------------------------- | |
| # 3) Gradio Callback Function | |
| # ---------------------------------------------------------------------------- | |
def analyze_cvs(job_description, cv_files):
    """Gradio callback: extract text from each uploaded CV and rank them.

    Args:
        job_description: Text from the job-description input box.
        cv_files: List of uploaded file objects from gr.File; may be None
            when the user submits without uploading anything.

    Returns:
        Rows of [filename, rounded similarity score] for the Dataframe
        output, best match first.
    """
    logging.info("Starting CV analysis.")
    resumes_data = {}
    # `cv_files` is None when no files were uploaded; treat it as empty.
    for uploaded_file in cv_files or []:
        # Get the base name, handling the temp paths Gradio assigns to uploads.
        filename = os.path.basename(uploaded_file.name)
        file_ext = os.path.splitext(filename)[1].lower()
        try:
            logging.info("Processing file: %s", filename)
            if file_ext == ".pdf":
                # Open the temporary file created by Gradio in binary mode.
                with open(uploaded_file.name, "rb") as f:
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(uploaded_file.name, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                # docx2txt accepts the temporary filepath directly.
                file_content = extract_text_from_docx(uploaded_file.name)
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error("Error processing file: %s", e)
            file_content = f"Error processing file: {e}"
        logging.info("Extracted CV Content (%s): %.500s...", filename, file_content)
        resumes_data[filename] = file_content
    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[fname, round(float(score), 3)] for fname, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
| # ---------------------------------------------------------------------------- | |
| # 4) Gradio Interface | |
| # ---------------------------------------------------------------------------- | |
def create_gradio_interface():
    """Build and return the Gradio Interface that wires the UI to analyze_cvs."""
    jd_box = gr.Textbox(
        label="Job Description",
        placeholder="Describe the role here...",
        lines=4,
    )
    uploads = gr.File(
        label="Upload resumes (PDF/DOCX/TXT)",
        file_count="multiple",
        type="filepath",
    )
    ranking_table = gr.Dataframe(
        headers=["Candidate CV", "Similarity Score"],
        label="Ranked Candidates",
    )
    return gr.Interface(
        fn=analyze_cvs,
        inputs=[jd_box, uploads],
        outputs=[ranking_table],
        title="Resume Ranking with TF-IDF",
    )
| # ---------------------------------------------------------------------------- | |
| # 5) Main Script | |
| # ---------------------------------------------------------------------------- | |
if __name__ == "__main__":
    # Fetch tokenizer tables and stopword lists before serving requests.
    # 'punkt_tab' is the resource word_tokenize requires on recent NLTK
    # releases (the module-level download covers imports; this covers a
    # fresh environment when running as a script).
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    app = create_gradio_interface()
    # Bind on all interfaces at 7860, the standard Hugging Face Spaces port.
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)