Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import PyPDF2 | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import re | |
| from typing import Dict, List | |
class PDFExtractor:
    """Extract text from PDF resumes with fallback mechanisms.

    Every method is a ``@staticmethod``, so it can be called either on the
    class (``PDFExtractor.extract_text(path)``) or on an instance.
    """

    # Anything that is not a word character, whitespace, or one of the listed
    # punctuation marks is deleted by normalize_text().  '+', '#', '/' and '&'
    # are kept so tokens such as "C++", "C#", "CI/CD" and "R&D" survive.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s.,;:()\-@+#/&]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    # Keys of the dict returned by detect_sections(), in output order.
    _SECTION_NAMES = (
        'experience',
        'education',
        'skills',
        'projects',
        'summary',
        'other',
    )

    # Keyword patterns marking the start of a section.  The leading \b stops
    # a keyword from matching in the middle of a longer word (for example
    # "about" inside "roundabout"); there is deliberately no trailing \b so
    # inflected forms such as "experiences" still match.  IGNORECASE lets the
    # patterns run on the original text instead of a lowercased copy.
    _SECTION_PATTERNS = {
        'experience': re.compile(
            r'\b(experience|work history|employment|professional experience)',
            re.IGNORECASE,
        ),
        'education': re.compile(
            r'\b(education|academic|qualifications|degrees)',
            re.IGNORECASE,
        ),
        'skills': re.compile(
            r'\b(skills|technical skills|competencies|expertise)',
            re.IGNORECASE,
        ),
        'projects': re.compile(
            r'\b(projects|portfolio|work samples)',
            re.IGNORECASE,
        ),
        'summary': re.compile(
            r'\b(summary|objective|profile|about)',
            re.IGNORECASE,
        ),
    }

    @staticmethod
    def extract_text(pdf_path: str) -> str:
        """Extract text from a PDF using pdfplumber, then PyPDF2, then OCR.

        Each strategy is tried in order.  A strategy is accepted only when
        its output is still non-empty after normalize_text(); otherwise the
        next strategy is tried.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The normalized text of the first strategy that produced any.

        Raises:
            RuntimeError: If OCR, the last strategy, fails or yields no text.
                The underlying OCR error, if any, is chained as ``__cause__``.
        """
        # The broad ``except Exception`` clauses are deliberate: any failure
        # of one backend should hand over to the next one, not abort.
        try:
            # Primary: pdfplumber
            normalized = PDFExtractor.normalize_text(
                PDFExtractor._extract_with_pdfplumber(pdf_path)
            )
            if normalized:
                return normalized
        except Exception as e:
            print(f"pdfplumber failed: {e}, trying PyPDF2...")

        try:
            # Fallback 1: PyPDF2
            normalized = PDFExtractor.normalize_text(
                PDFExtractor._extract_with_pypdf2(pdf_path)
            )
            if normalized:
                return normalized
        except Exception as e:
            print(f"PyPDF2 failed: {e}, trying OCR...")

        try:
            # Fallback 2: OCR with Tesseract
            print("Attempting OCR extraction (this may take a moment)...")
            normalized = PDFExtractor.normalize_text(
                PDFExtractor._extract_with_ocr(pdf_path)
            )
        except Exception as e:
            raise RuntimeError(
                f"All extraction methods failed. Last error (OCR): {e}"
            ) from e

        if not normalized:
            raise RuntimeError(
                "All extraction methods failed. Last error (OCR): "
                "OCR extraction returned empty text"
            )
        return normalized

    @staticmethod
    def _extract_with_pdfplumber(pdf_path: str) -> str:
        """Return the raw text of every page as read by pdfplumber."""
        with pdfplumber.open(pdf_path) as pdf:
            # Join with a newline so the last word of one page does not fuse
            # with the first word of the next.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(pdf_path: str) -> str:
        """Return the raw text of every page as read by PyPDF2."""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            return "\n".join(page.extract_text() or "" for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(pdf_path: str) -> str:
        """Render each page to an image and return the Tesseract OCR text."""
        images = convert_from_path(pdf_path, dpi=300)
        page_texts = []
        for i, image in enumerate(images):
            page_text = pytesseract.image_to_string(image, lang='eng')
            page_texts.append(page_text + "\n")
            print(f"OCR page {i+1}: extracted {len(page_text)} chars")
        return "".join(page_texts)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Clean and normalize extracted text.

        Deletes every character that is not a word character, whitespace, or
        one of ``. , ; : ( ) - @ + # / &``, then collapses each run of
        whitespace (including newlines) into a single space and strips the
        ends.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned, single-line text; ``""`` when nothing is left.
        """
        # Delete first, collapse second: removing a glyph that sat between
        # two spaces would otherwise leave a double space behind.
        text = PDFExtractor._DISALLOWED_CHARS.sub('', text)
        text = PDFExtractor._WHITESPACE_RUN.sub(' ', text)
        return text.strip()

    @staticmethod
    def detect_sections(text: str) -> Dict[str, str]:
        """Split resume text into common sections by keyword.

        Every keyword match starts a chunk that runs up to the next keyword
        match of any section (or the end of the text).  Chunks belonging to
        the same section are joined with a single space, in document order.
        Text before the first keyword is stored under ``'other'``.

        Args:
            text: Resume text to split.

        Returns:
            A dict with the keys ``'experience'``, ``'education'``,
            ``'skills'``, ``'projects'``, ``'summary'`` and ``'other'``.
            Sections with no content map to ``""``.  When no keyword matches
            at all, ``'other'`` holds ``text`` unchanged.
        """
        sections = {name: '' for name in PDFExtractor._SECTION_NAMES}

        # Offsets are found on the original text (not on text.lower(), whose
        # length can differ for some Unicode characters) so they are always
        # valid for slicing ``text``.
        boundaries = sorted(
            (match.start(), section_name)
            for section_name, pattern in PDFExtractor._SECTION_PATTERNS.items()
            for match in pattern.finditer(text)
        )

        # If no sections detected, put everything in 'other'
        if not boundaries:
            sections['other'] = text
            return sections

        # Keep whatever precedes the first keyword instead of dropping it.
        sections['other'] = text[:boundaries[0][0]].strip()

        chunk_ends = [start for start, _ in boundaries[1:]] + [len(text)]
        for (start_pos, section_name), end_pos in zip(boundaries, chunk_ends):
            chunk = text[start_pos:end_pos].strip()
            # Append rather than overwrite so a repeated keyword does not
            # discard the content collected for that section earlier.
            sections[section_name] = f"{sections[section_name]} {chunk}".strip()

        return sections