| import pymupdf |
| import easyocr |
| import io |
| import os |
|
|
| |
| |
# Shared EasyOCR reader for French and English text, created once at import
# time and reused by every OCR call in this module. GPU use is disabled, so
# recognition runs on the CPU.
reader = easyocr.Reader(lang_list=["fr", "en"], gpu=False)
|
|
def extract_pdf_text_with_easyocr(
    file_path: str,
    zoom: float = 3.0,
    min_text_length: int = 50,
    save_to_file: "str | None" = None,
) -> str:
    """Extract text from a PDF by rendering each page and running EasyOCR on it.

    Every page is rasterised to an RGB PNG image (scaled by ``zoom`` on both
    axes) and passed to the module-level EasyOCR ``reader``. OCR is applied to
    all pages unconditionally; the PDF's embedded text layer is not consulted.

    Args:
        file_path: Path to the PDF file to read.
        zoom: Scale factor applied horizontally and vertically when rendering
            a page to an image before OCR.
        min_text_length: A page's OCR text is kept only when it is strictly
            longer than this many characters. Pages whose OCR call failed are
            always kept, as an error placeholder.
        save_to_file: Optional path. When given, the assembled text is also
            written there as UTF-8. A failure to write is reported on stdout
            and does not abort the extraction.

    Returns:
        The kept pages concatenated in page order, each preceded by a
        ``--- Page N ---`` header, with surrounding whitespace stripped.
        An empty string when no page passed the length filter.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If PyMuPDF cannot open the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")

    try:
        doc = pymupdf.open(file_path)
    except Exception as e:
        # Chain the cause so the underlying PyMuPDF error stays in the traceback.
        raise RuntimeError(f"Failed to open PDF with PyMuPDF: {e}") from e

    sections = []
    try:
        # The render matrix is the same for every page, so build it once.
        render_matrix = pymupdf.Matrix(zoom, zoom)

        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=render_matrix, colorspace=pymupdf.csRGB)
            img_bytes = pix.tobytes("png")

            ocr_failed = False
            try:
                result = reader.readtext(
                    img_bytes,
                    detail=0,
                    paragraph=True,
                    width_ths=0.7,
                    height_ths=0.7,
                )
                page_text = "\n".join(
                    line.strip() for line in result if line.strip()
                )
            except Exception as ocr_error:
                # Best effort: one bad page must not abort the whole document.
                ocr_failed = True
                page_text = f"[OCR Error on page {page_number}: {ocr_error}]"

            # Use an explicit flag rather than searching the text for
            # "OCR Error": genuine OCR output may contain that phrase and
            # would otherwise bypass the length filter.
            if ocr_failed or len(page_text) > min_text_length:
                sections.append(f"--- Page {page_number} ---\n{page_text}\n\n")
    finally:
        # Release the document even when rendering raises.
        doc.close()

    full_text = "".join(sections)

    if save_to_file:
        try:
            with open(save_to_file, "w", encoding="utf-8") as f:
                f.write(full_text)
            print(f"Extracted text saved to: {save_to_file}")
        except Exception as e:
            # Saving is optional; report the failure and still return the text.
            print(f"Failed to save file: {e}")

    return full_text.strip()
|
|
| |
if __name__ == "__main__":
    # Demo run: OCR a sample PDF, save the full text next to it, and show
    # only the beginning of the result on the console.
    preview_limit = 1000

    extracted_text = extract_pdf_text_with_easyocr(
        file_path="Kbis.pdf",
        zoom=3.5,
        save_to_file="kbis_extracted.txt",
    )

    print("\n=== EXTRACTED TEXT PREVIEW ===\n")
    print(extracted_text[:preview_limit])

    # A trailing line is always printed: either the truncation notice or an
    # empty line when the whole text fitted in the preview.
    if len(extracted_text) > preview_limit:
        print("\n... (truncated)")
    else:
        print("")