Spaces:
Runtime error
Runtime error
| import os | |
| import re | |
| from typing import Dict, List, Optional, Tuple | |
| import PyPDF2 | |
| import docx2txt | |
| from PIL import Image | |
| import pytesseract | |
| import io | |
class ResumeScanner:
    """Extract plain text from resume files for downstream vector search.

    Supported inputs are PDF, DOC/DOCX, TXT and common image formats (via
    OCR).  Every extractor is best-effort: on failure it prints a message
    and returns an empty string instead of raising.
    """

    def __init__(self):
        pass

    def extract_text_from_file(self, file_content: bytes, filename: str) -> str:
        """Extract text from a file, choosing the extractor by extension.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Original file name; only the text after the last dot
                (case-insensitive) is used to pick the extractor.

        Returns:
            The extracted text, or "" if the format is unsupported or the
            extraction fails.
        """
        file_ext = filename.lower().rsplit('.', 1)[-1]
        try:
            if file_ext == 'pdf':
                return self._extract_from_pdf(file_content)
            elif file_ext in ('doc', 'docx'):
                return self._extract_from_docx(file_content)
            elif file_ext == 'txt':
                return self._extract_from_text(file_content)
            elif file_ext in ('jpg', 'jpeg', 'png', 'bmp', 'tiff'):
                return self._extract_from_image(file_content)
            else:
                raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            # Deliberate best-effort boundary: callers receive "" rather than
            # an exception.  NOTE(review): the leading "β" in these messages
            # looks like a mis-encoded symbol; it is kept unchanged here.
            print(f"β Error extracting text from {filename}: {e}")
            return ""

    def _extract_from_text(self, file_content: bytes) -> str:
        """Decode a plain-text file.

        UTF-8 is tried first ("utf-8-sig" also drops a leading BOM).  Bytes
        that are not valid UTF-8 are decoded as Windows-1252 with undecodable
        bytes replaced, so a non-UTF-8 file still yields searchable text
        instead of an empty result.
        """
        try:
            return file_content.decode('utf-8-sig')
        except UnicodeDecodeError:
            return file_content.decode('cp1252', errors='replace')

    def _extract_from_pdf(self, file_content: bytes) -> str:
        """Extract text from PDF file, one newline-terminated chunk per page."""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # `or ""` guards against a page yielding None (presumably possible
            # for pages without a text layer in some PyPDF2 versions), which
            # would otherwise raise and discard the text of every page.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"β Error reading PDF: {e}")
            return ""

    def _extract_from_docx(self, file_content: bytes) -> str:
        """Extract text from DOCX file.

        NOTE(review): legacy ".doc" uploads are routed here as well; confirm
        that docx2txt can actually read them, otherwise they end up as "".
        """
        try:
            return docx2txt.process(io.BytesIO(file_content))
        except Exception as e:
            print(f"β Error reading DOCX: {e}")
            return ""

    def _extract_from_image(self, file_content: bytes) -> str:
        """Extract text from image using OCR."""
        try:
            # The context manager releases the image once OCR has finished.
            with Image.open(io.BytesIO(file_content)) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"β Error reading image with OCR: {e}")
            return ""

    def clean_extracted_text(self, text: str) -> str:
        """Clean and optimize extracted text for better vector search.

        Characters other than word characters, whitespace and ``. , @ -`` are
        replaced by spaces, then every whitespace run (including newlines) is
        collapsed to a single space and the result is trimmed.

        Args:
            text: Raw extracted text; may be empty or None.

        Returns:
            The normalized single-line text, or "" for empty input.
        """
        if not text:
            return ""
        # Replace special characters first: each replacement inserts a space,
        # so whitespace must be collapsed afterwards or runs of spaces remain.
        text = re.sub(r'[^\w\s.,@-]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
# Module-level ResumeScanner instance, created once when this module is imported.
resume_scanner = ResumeScanner()