| """ |
| PDF Parser utility using PyMuPDF (fitz). |
| Extracts raw text from PDF files with page tracking. |
| """ |
|
|
| import fitz |
|
|
|
|
class PDFExtractionError(Exception):
    """Error raised when a PDF cannot be read or its text cannot be extracted."""
|
|
|
|
def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extract the full text of a PDF, one labelled section per page with text.

    Every page whose extracted text is not whitespace-only is emitted as a
    ``--- PAGE <n> ---`` header (1-based page number) followed by the page
    text. Sections are joined with two newlines. Pages without text are
    skipped.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        The concatenated, page-labelled text of the document.

    Raises:
        PDFExtractionError: If the PDF cannot be opened, has no pages, a page
            cannot be read, or no page yields any text.
    """
    try:
        doc = fitz.open(stream=pdf_content, filetype="pdf")
    except Exception as e:
        raise PDFExtractionError(f"Impossible d'ouvrir le PDF: {e}") from e

    # From here on the document is open: the finally clause guarantees it is
    # closed on every exit path, including errors raised while reading pages.
    try:
        page_count = len(doc)
        if page_count == 0:
            raise PDFExtractionError("Le PDF est vide (aucune page).")

        full_text = []
        for page_num in range(page_count):
            # Only the PyMuPDF calls are guarded so the error message can
            # name the page that failed.
            try:
                text = doc.load_page(page_num).get_text("text")
            except Exception as e:
                raise PDFExtractionError(
                    f"Erreur lors de l'extraction de la page {page_num + 1}: {e}"
                ) from e

            if text.strip():
                full_text.append(f"--- PAGE {page_num + 1} ---\n{text}")
    finally:
        doc.close()

    if not full_text:
        raise PDFExtractionError(
            "Le PDF ne contient aucun texte extractible (scanné ou image uniquement)."
        )

    return "\n\n".join(full_text)
|
|
|
|
def extract_text_from_uploaded_file(uploaded_file) -> str:
    """
    Extract text from a Streamlit uploaded file object.

    Reads the file's bytes, rewinds the file to its start so it can be read
    again by the caller, and delegates to ``extract_text_from_pdf``.

    Args:
        uploaded_file: File-like object providing ``read()`` and ``seek()``.

    Returns:
        The extracted text, as returned by ``extract_text_from_pdf``.

    Raises:
        PDFExtractionError: If the file cannot be read, is empty, or text
            extraction fails.
    """
    try:
        pdf_bytes = uploaded_file.read()
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise PDFExtractionError(f"Impossible de lire le fichier: {e}") from e

    if not pdf_bytes:
        raise PDFExtractionError("Le fichier est vide.")

    # Rewind before extraction so the file is reusable even if parsing fails.
    uploaded_file.seek(0)
    return extract_text_from_pdf(pdf_bytes)
|
|
|
|
def get_page_count(pdf_content: bytes) -> int:
    """
    Get the number of pages in a PDF.

    This is a best-effort helper: any failure while opening or inspecting
    the document is swallowed and reported as 0 pages.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        The page count, or 0 if the document could not be opened or read.
    """
    try:
        doc = fitz.open(stream=pdf_content, filetype="pdf")
        try:
            return len(doc)
        finally:
            # Close the document even if counting the pages fails.
            doc.close()
    except Exception:
        return 0
|
|