Spaces:
Sleeping
Sleeping
"""
PDF parser utility using PyMuPDF (fitz).

Extracts raw text from PDF files, prefixing each page's text with a
"--- PAGE n ---" marker so the page of origin can be tracked.
"""
| import fitz # PyMuPDF | |
class PDFExtractionError(Exception):
    """Raised when a PDF cannot be opened, read, or yields no usable text."""
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract the full text of a PDF supplied as raw bytes.

    Every page whose text is not blank becomes a section headed by
    ``--- PAGE n ---`` (1-based page number); sections are joined with
    two newlines. Pages without text are skipped.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        The text of all non-blank pages, each preceded by its page marker.

    Raises:
        PDFExtractionError: If the PDF cannot be opened, has no pages, a
            page fails to extract, or no page contains extractable text.
    """
    try:
        doc = fitz.open(stream=pdf_content, filetype="pdf")
    except Exception as e:
        raise PDFExtractionError(f"Impossible d'ouvrir le PDF: {e}") from e

    # try/finally guarantees the document is closed on every exit path,
    # including a failure in the middle of the page loop.
    try:
        page_count = len(doc)
        if page_count == 0:
            raise PDFExtractionError("Le PDF est vide (aucune page).")

        full_text = []
        for page_num in range(page_count):
            try:
                text = doc.load_page(page_num).get_text("text")
            except Exception as e:
                raise PDFExtractionError(
                    f"Erreur lors de l'extraction de la page {page_num + 1}: {e}"
                ) from e
            # Skip pages that contain only whitespace (e.g. image-only pages).
            if text.strip():
                full_text.append(f"--- PAGE {page_num + 1} ---\n{text}")
    finally:
        doc.close()

    if not full_text:
        raise PDFExtractionError(
            "Le PDF ne contient aucun texte extractible (scanné ou image uniquement)."
        )
    return "\n\n".join(full_text)
def extract_text_from_uploaded_file(uploaded_file) -> str:
    """
    Extract text from a Streamlit uploaded file object.

    The stream is rewound before reading, so a file that was already
    consumed elsewhere is still read in full, and rewound again afterwards
    so it can be re-read by later code.

    Args:
        uploaded_file: A seekable binary file-like object exposing
            ``read()`` and ``seek()``.

    Returns:
        The extracted text, as produced by ``extract_text_from_pdf``.

    Raises:
        PDFExtractionError: If the file cannot be read or rewound, is
            empty, or text extraction fails.
    """
    try:
        uploaded_file.seek(0)  # Read from the start even if already consumed
        pdf_bytes = uploaded_file.read()
        uploaded_file.seek(0)  # Reset for potential re-read
    except Exception as e:
        raise PDFExtractionError(f"Impossible de lire le fichier: {e}") from e

    if not pdf_bytes:
        raise PDFExtractionError("Le fichier est vide.")

    return extract_text_from_pdf(pdf_bytes)
def get_page_count(pdf_content: bytes) -> int:
    """
    Return the number of pages in a PDF supplied as raw bytes.

    This is a best-effort helper: any failure to open or inspect the
    document yields 0 instead of raising.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        The page count, or 0 if the document could not be opened or read.
    """
    try:
        doc = fitz.open(stream=pdf_content, filetype="pdf")
        try:
            return len(doc)
        finally:
            # Close the document even if counting the pages fails.
            doc.close()
    except Exception:
        return 0