Spaces:
Sleeping
Sleeping
File size: 2,145 Bytes
02a29f7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | """
PDF Parser utility using PyMuPDF (fitz).
Extracts raw text from PDF files with page tracking.
"""
import fitz # PyMuPDF
class PDFExtractionError(Exception):
"""Custom exception for PDF extraction errors."""
pass
def extract_text_from_pdf(pdf_content: bytes) -> str:
"""
Extract full text from a PDF file.
Raises:
PDFExtractionError: If the PDF is corrupted, empty, or cannot be read.
"""
try:
doc = fitz.open(stream=pdf_content, filetype="pdf")
except Exception as e:
raise PDFExtractionError(f"Impossible d'ouvrir le PDF: {str(e)}")
if len(doc) == 0:
doc.close()
raise PDFExtractionError("Le PDF est vide (aucune page).")
full_text = []
for page_num in range(len(doc)):
try:
page = doc.load_page(page_num)
text = page.get_text("text")
if text.strip():
full_text.append(f"--- PAGE {page_num + 1} ---\n{text}")
except Exception as e:
raise PDFExtractionError(
f"Erreur lors de l'extraction de la page {page_num + 1}: {str(e)}"
)
doc.close()
if not full_text:
raise PDFExtractionError(
"Le PDF ne contient aucun texte extractible (scanné ou image uniquement)."
)
return "\n\n".join(full_text)
def extract_text_from_uploaded_file(uploaded_file) -> str:
"""
Extract text from a Streamlit uploaded file object.
Raises:
PDFExtractionError: If extraction fails.
"""
try:
pdf_bytes = uploaded_file.read()
except Exception as e:
raise PDFExtractionError(f"Impossible de lire le fichier: {str(e)}")
if len(pdf_bytes) == 0:
raise PDFExtractionError("Le fichier est vide.")
uploaded_file.seek(0) # Reset for potential re-read
return extract_text_from_pdf(pdf_bytes)
def get_page_count(pdf_content: bytes) -> int:
"""Get the number of pages in a PDF."""
try:
doc = fitz.open(stream=pdf_content, filetype="pdf")
count = len(doc)
doc.close()
return count
except Exception:
return 0
|