Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| from app.utils.text_cleaner import TextCleaner | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class ResumeParser:
    """Extract cleaned text and basic document metadata from PDF files.

    Every method is a static method, so it can be called either on the
    class (``ResumeParser.parse_file(path)``) or on an instance.
    """

    @staticmethod
    def _read_all_pages(doc) -> str:
        """Return the raw text of every page in *doc*, concatenated in order."""
        return "".join(page.get_text() for page in doc)

    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """Open the PDF at *file_path* and return its cleaned text.

        The text of all pages is concatenated and passed through
        ``TextCleaner.clean_text``.

        Raises:
            ValueError: if opening, reading or cleaning fails for any
                reason; the original exception is chained as the cause.
        """
        try:
            # The context manager closes the document even when a page
            # fails to extract, so the file handle is never leaked.
            with fitz.open(file_path) as doc:
                raw_text = ResumeParser._read_all_pages(doc)
            return TextCleaner.clean_text(raw_text)
        except Exception as e:
            logger.exception("Error extracting text from PDF: %s", e)
            raise ValueError(f"Could not parse PDF file: {e}") from e

    @staticmethod
    def extract_metadata(doc: "fitz.Document", file_path: str) -> dict:
        """Build a metadata dictionary for an already-open document.

        Args:
            doc: open document exposing ``metadata`` and ``page_count``.
            file_path: path the document was loaded from; stored as
                ``"source"``.

        Returns:
            A dict with the keys ``source``, ``page_count``, ``author``,
            ``creation_date`` and ``producer``. Fields missing from the
            document metadata are ``None``.
        """
        # Fall back to an empty mapping so a document without metadata
        # yields None fields instead of raising AttributeError.
        info = doc.metadata or {}
        return {
            "source": file_path,
            "page_count": doc.page_count,
            "author": info.get("author"),
            "creation_date": info.get("creationDate"),
            "producer": info.get("producer"),
        }

    @staticmethod
    def parse_file(file_path: str) -> dict:
        """Parse a PDF file into ``{"content": ..., "metadata": ...}``.

        Args:
            file_path: path to the file; the extension check is
                case-insensitive.

        Returns:
            A dict whose ``"content"`` is the cleaned text of all pages and
            whose ``"metadata"`` is the result of :meth:`extract_metadata`.

        Raises:
            ValueError: if the path does not end in ``.pdf``, or if the
                file cannot be opened, read or cleaned.
        """
        if not file_path.lower().endswith(".pdf"):
            raise ValueError("Unsupported file format. Only PDF is supported.")

        try:
            # Metadata must be collected while the document is still open.
            with fitz.open(file_path) as doc:
                raw_text = ResumeParser._read_all_pages(doc)
                metadata = ResumeParser.extract_metadata(doc, file_path)
            return {
                "content": TextCleaner.clean_text(raw_text),
                "metadata": metadata,
            }
        except Exception as e:
            logger.exception("Error extracting text from PDF: %s", e)
            raise ValueError(f"Could not parse PDF file: {e}") from e