"""Best-effort plain-text extraction from resume files (PDF, DOCX, images).

Every public extractor returns a whitespace-normalised string and returns
``""`` instead of raising when a file cannot be read; the reason for a
failure is reported through the module logger.
"""

import logging
import os
import re
import tempfile  # noqa: F401  (no longer used here; kept for importers of this module)
from typing import List, Union

# The third-party back-ends are imported defensively so that a missing
# OCR/PDF/DOCX package only disables the formats that need it rather than
# making the whole module unimportable. When a package is absent its name is
# bound to None and the affected extractor returns "".
try:
    import pytesseract
except ImportError:  # pragma: no cover - depends on the environment
    pytesseract = None

try:
    import PyPDF2
except ImportError:  # pragma: no cover - depends on the environment
    PyPDF2 = None

try:
    import docx
except ImportError:  # pragma: no cover - depends on the environment
    docx = None

try:
    from PIL import Image
except ImportError:  # pragma: no cover - depends on the environment
    Image = None

try:
    from pdf2image import convert_from_path
except ImportError:  # pragma: no cover - depends on the environment
    convert_from_path = None

logger = logging.getLogger(__name__)

_PathArg = Union[str, "os.PathLike[str]"]

_WHITESPACE_RE = re.compile(r"\s+")

# A PDF whose embedded text layer is shorter than this many characters is
# additionally run through OCR.
_MIN_EMBEDDED_TEXT_CHARS = 100


def clean_text(text) -> str:
    """Return *text* as a string with whitespace runs collapsed to one space.

    Args:
        text: Any value; it is converted with ``str()``. Falsy values
            (``None``, ``""``, ``0``, ...) yield ``""``.

    Returns:
        The normalised text with leading and trailing whitespace removed.
    """
    if not text:
        return ""
    return _WHITESPACE_RE.sub(" ", str(text)).strip()


def extract_text_from_image(file_path: _PathArg) -> str:
    """Run OCR on the image at *file_path* and return the cleaned text.

    Args:
        file_path: Path of an image file that ``PIL.Image.open`` can read.

    Returns:
        The recognised text, or ``""`` if the image cannot be opened, OCR
        fails, or the Pillow/pytesseract packages are not installed.
    """
    if Image is None or pytesseract is None:
        logger.warning(
            "Pillow and pytesseract are required for image OCR; skipping %r",
            file_path,
        )
        return ""
    try:
        # The context manager closes the underlying file handle, which the
        # bare Image.open() call would leave open.
        with Image.open(file_path) as img:
            return clean_text(pytesseract.image_to_string(img))
    except Exception:
        # Deliberately best-effort: report the cause, then fall back to "".
        logger.warning("OCR failed for image %r", file_path, exc_info=True)
        return ""


def extract_text_from_docx(file_path: _PathArg) -> str:
    """Return the paragraph text of the Word document at *file_path*.

    Only ``Document.paragraphs`` is read, and the paragraphs are joined with
    single spaces.

    Args:
        file_path: Path of the document to read.

    Returns:
        The cleaned text, or ``""`` if the document cannot be read or the
        python-docx package is not installed.
    """
    if docx is None:
        logger.warning(
            "python-docx is required to read Word documents; skipping %r",
            file_path,
        )
        return ""
    try:
        document = docx.Document(file_path)
        return clean_text(" ".join(p.text for p in document.paragraphs))
    except Exception:
        # Deliberately best-effort: report the cause, then fall back to "".
        logger.warning("Could not read document %r", file_path, exc_info=True)
        return ""


def _pdf_text_layer(file_path: _PathArg) -> List[str]:
    """Return the non-empty embedded text of each PDF page, in page order."""
    pages: List[str] = []
    if PyPDF2 is None:
        logger.warning(
            "PyPDF2 is not installed; skipping the text layer of %r", file_path
        )
        return pages
    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
    except Exception:
        # Keep whatever pages were read before the failure.
        logger.warning(
            "Could not read the text layer of %r", file_path, exc_info=True
        )
    return pages


def _pdf_ocr_pages(file_path: _PathArg) -> List[str]:
    """Rasterise the PDF and return the non-empty OCR text of each page."""
    pages: List[str] = []
    if convert_from_path is None or pytesseract is None:
        logger.warning(
            "pdf2image and pytesseract are required for PDF OCR; skipping %r",
            file_path,
        )
        return pages
    try:
        images = convert_from_path(file_path)
    except Exception:
        logger.warning("Could not rasterise %r", file_path, exc_info=True)
        return pages
    for number, image in enumerate(images, start=1):
        try:
            # pytesseract accepts the in-memory PIL image, so no temporary
            # file is needed (re-opening a NamedTemporaryFile by name while
            # it is still open does not work on Windows).
            page_text = clean_text(pytesseract.image_to_string(image))
        except Exception:
            # One unreadable page must not discard the remaining pages.
            logger.warning(
                "OCR failed for page %d of %r", number, file_path, exc_info=True
            )
            continue
        if page_text:
            pages.append(page_text)
    return pages


def extract_text_from_pdf(file_path: _PathArg) -> str:
    """Return the text of the PDF at *file_path*.

    The embedded text layer is read first. If it holds fewer than
    ``_MIN_EMBEDDED_TEXT_CHARS`` characters, every page is also rendered to
    an image and OCR'd, and the OCR text is appended to whatever the text
    layer produced.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The cleaned text, or ``""`` if nothing could be extracted.
    """
    parts = _pdf_text_layer(file_path)
    if len(" ".join(parts).strip()) < _MIN_EMBEDDED_TEXT_CHARS:
        parts.extend(_pdf_ocr_pages(file_path))
    # Joining with a space keeps the last word of one page from fusing with
    # the first word of the next.
    return clean_text(" ".join(parts))


def parse_resume(file_path: _PathArg) -> str:
    """Extract text from a resume, choosing the extractor by file extension.

    The extension is matched case-insensitively: ``.pdf`` goes to
    ``extract_text_from_pdf``, ``.docx``/``.doc`` to
    ``extract_text_from_docx`` and ``.png``/``.jpg``/``.jpeg`` to
    ``extract_text_from_image``.

    Args:
        file_path: Path of the resume file.

    Returns:
        The cleaned text, or ``""`` for an empty/``None`` path, an
        unsupported extension, or a file that could not be read.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    if ext in (".docx", ".doc"):
        # NOTE(review): python-docx documents support for .docx only; legacy
        # binary .doc files are expected to fail to open and so yield "".
        return extract_text_from_docx(file_path)
    if ext in (".png", ".jpg", ".jpeg"):
        return extract_text_from_image(file_path)
    return ""