Spaces:
Sleeping
Sleeping
| import re | |
| import os | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
| from utils.text_processing import clean_text | |
| def extract_text_from_pdf(pdf_path): | |
| text = "" | |
| try: | |
| with open(pdf_path, 'rb') as file: | |
| reader = PdfReader(file) | |
| for page in reader.pages: | |
| page_text = page.extract_text() | |
| if page_text: | |
| text += page_text + "\n" | |
| except Exception as e: | |
| print(f"Error processing PDF: {e}") | |
| return text | |
| def extract_text_from_docx(docx_path): | |
| text = "" | |
| try: | |
| doc = Document(docx_path) | |
| for para in doc.paragraphs: | |
| if para.text.strip(): | |
| text += para.text + "\n" | |
| except Exception as e: | |
| print(f"Error processing DOCX: {e}") | |
| return text | |
| def process_resume(file_path): | |
| if file_path.endswith('.pdf'): | |
| text = extract_text_from_pdf(file_path) | |
| elif file_path.endswith('.docx'): | |
| text = extract_text_from_docx(file_path) | |
| else: | |
| try: | |
| with open(file_path, 'r', encoding='utf-8') as file: | |
| text = file.read() | |
| except: | |
| with open(file_path, 'r', encoding='latin-1') as file: | |
| text = file.read() | |
| return clean_text(text) | |
| def process_jd(file_path): | |
| return process_resume(file_path) |