Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import tempfile | |
| import pytesseract | |
| import PyPDF2 | |
| import docx | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
def clean_text(text):
    """Normalize whitespace in *text*.

    Any falsy input (``None``, ``""``, ``0``, ...) yields an empty string.
    Otherwise the value is converted with ``str()``, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    if text:
        collapsed = re.sub(r"\s+", " ", str(text))
        return collapsed.strip()
    return ""
def extract_text_from_image(file_path):
    """Run OCR on an image file and return the normalized text.

    The image is opened with PIL, passed to ``pytesseract.image_to_string``,
    and the result is normalized with ``clean_text``.

    Args:
        file_path: Path of the image to read.

    Returns:
        The cleaned OCR text, or ``""`` if opening the image or running
        OCR raises an exception.
    """
    try:
        # The context manager closes the image's file handle as soon as OCR
        # is done instead of leaving it open until garbage collection.
        with Image.open(file_path) as img:
            return clean_text(pytesseract.image_to_string(img))
    except Exception:
        # Deliberate best-effort: an unreadable image yields no text.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""
def extract_text_from_docx(file_path):
    """Return the normalized paragraph text of a Word document.

    The file is loaded with ``docx.Document``; the text of every entry in
    ``doc.paragraphs`` is joined with single spaces and normalized with
    ``clean_text``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The cleaned text, or ``""`` if loading or reading the document
        raises an exception.
    """
    try:
        document = docx.Document(file_path)
        return clean_text(" ".join(p.text for p in document.paragraphs))
    except Exception:
        # Deliberate best-effort: an unreadable document yields no text.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""
def extract_text_from_pdf(file_path, *, ocr_threshold=100):
    """Extract text from a PDF, falling back to OCR when little text is found.

    The embedded text is read page by page with ``PyPDF2.PdfReader``. If the
    stripped result is shorter than ``ocr_threshold`` characters, the pages
    are rendered with ``convert_from_path`` and each page image is passed to
    ``pytesseract.image_to_string``; the OCR output is appended after
    whatever embedded text was found.

    Args:
        file_path: Path of the PDF file.
        ocr_threshold: Stripped length of the embedded text below which the
            OCR fallback is attempted. Defaults to 100.

    Returns:
        The collected text normalized with ``clean_text``; ``""`` when
        nothing could be extracted.
    """
    parts = []

    try:
        with open(file_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
    except Exception:
        # Best-effort: keep the pages read so far; the OCR fallback below
        # may still recover text from a file PyPDF2 could not parse.
        pass

    if len(" ".join(parts).strip()) < ocr_threshold:
        try:
            page_images = convert_from_path(file_path)
        except Exception:
            # Rendering failed; there is nothing to OCR.
            page_images = []

        for page_image in page_images:
            # OCR the rendered page directly instead of round-tripping it
            # through a temporary PNG that must be reopened by name while
            # still open (which fails on Windows).
            try:
                parts.append(pytesseract.image_to_string(page_image))
            except Exception:
                # One page failing OCR must not discard the other pages.
                continue

    # Joining with a space keeps the last word of one page from fusing with
    # the first word of the next; clean_text collapses any extra whitespace.
    return clean_text(" ".join(parts))
def parse_resume(file_path):
    """Extract text from a resume file, choosing the extractor by extension.

    The extension is compared case-insensitively:

    * ``.pdf`` -> ``extract_text_from_pdf``
    * ``.docx`` / ``.doc`` -> ``extract_text_from_docx``
    * ``.png`` / ``.jpg`` / ``.jpeg`` -> ``extract_text_from_image``

    Any other extension (or none) returns an empty string.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        extractor = extract_text_from_pdf
    elif suffix in (".docx", ".doc"):
        extractor = extract_text_from_docx
    elif suffix in (".png", ".jpg", ".jpeg"):
        extractor = extract_text_from_image
    else:
        # Unsupported file type.
        return ""

    return extractor(file_path)