Spaces:

Cassius1Morbant
/

French_Legal_Chatbot

Running

App Files Files Community

Cassius1Morbant committed on Jan 19

Commit

593231c

verified ·

1 Parent(s): da25573

Upload extract.py

Browse files

Files changed (1) hide show

extract.py +84 -0

extract.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import pymupdf  # PyMuPDF
+import easyocr
+import io
+import os
+# Initialise EasyOCR once (French + English) — outside the function for efficiency
+# First run will download models (~400 MB); subsequent runs are fast.
+reader = easyocr.Reader(['fr', 'en'], gpu=False)  # Set gpu=True if you have CUDA
def extract_pdf_text_with_easyocr(
    file_path: str,
    zoom: float = 3.0,           # 3.0 ≈ 300 DPI — good balance of quality/speed
    min_text_length: int = 50,   # Ignore pages with very little text (likely blank)
    save_to_file: str = None     # Optional: path to save extracted text
) -> str:
    """
    Extract text from a PDF by rendering every page and running EasyOCR on it.

    Each page is rasterised with PyMuPDF at the requested zoom factor and the
    resulting PNG is passed to the module-level ``reader``. OCR is always
    used; the PDF's embedded text layer (if any) is not consulted.

    Args:
        file_path: Path to the PDF file to read.
        zoom: Scale factor applied on both axes when rendering a page.
        min_text_length: A successfully OCR'd page is kept only when its text
            is strictly longer than this many characters.
        save_to_file: When truthy, path of a UTF-8 text file that receives the
            extracted text. A failure to write is reported on stdout and does
            not abort the extraction.

    Returns:
        The kept pages, each introduced by a ``--- Page N ---`` header,
        concatenated and stripped of leading/trailing whitespace. Pages whose
        OCR failed are kept and contain an ``[OCR Error on page N: ...]``
        marker instead of text.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If PyMuPDF cannot open the file (original error chained).
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")
    try:
        doc = pymupdf.open(file_path)
    except Exception as e:
        # Chain the cause so the underlying PyMuPDF error stays in the traceback.
        raise RuntimeError(f"Failed to open PDF with PyMuPDF: {e}") from e

    page_sections = []
    try:
        # The render matrix does not depend on the page, so build it once.
        mat = pymupdf.Matrix(zoom, zoom)
        for page_number, page in enumerate(doc, start=1):
            # Render page to a high-resolution RGB pixmap, then to PNG bytes.
            pix = page.get_pixmap(matrix=mat, colorspace=pymupdf.csRGB)
            img_bytes = pix.tobytes("png")

            # Track failure explicitly instead of searching the text for
            # "OCR Error", which could also appear in genuine page content.
            ocr_failed = False
            try:
                result = reader.readtext(
                    img_bytes,
                    detail=0,           # Return only text strings
                    paragraph=True,     # Group into paragraphs
                    width_ths=0.7,      # Adjust for better line grouping
                    height_ths=0.7
                )
                page_text = "\n".join(
                    line.strip() for line in result if line.strip()
                )
            except Exception as ocr_error:
                # Deliberate best-effort: one bad page must not lose the rest.
                ocr_failed = True
                page_text = f"[OCR Error on page {page_number}: {ocr_error}]"

            # Keep pages with meaningful text; always keep error markers so
            # the caller can see which pages failed.
            if ocr_failed or len(page_text) > min_text_length:
                page_sections.append(
                    f"--- Page {page_number} ---\n{page_text}\n\n"
                )
    finally:
        # Release the document even if rendering raised part-way through.
        doc.close()

    full_text = "".join(page_sections)

    # Optional: Save to file
    if save_to_file:
        try:
            with open(save_to_file, "w", encoding="utf-8") as f:
                f.write(full_text)
            print(f"Extracted text saved to: {save_to_file}")
        except Exception as e:
            # Saving is a convenience; the text is still returned on failure.
            print(f"Failed to save file: {e}")
    return full_text.strip()
# ———————— TEST USAGE ————————
# Usage: python extract.py [input.pdf] [output.txt]
# Both arguments are optional; the defaults below are used when omitted.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    pdf_file = cli_args[0] if cli_args else "Kbis.pdf"
    output_file = cli_args[1] if len(cli_args) > 1 else "kbis_extracted.txt"
    preview_limit = 1000  # Number of characters shown in the console preview

    extracted_text = extract_pdf_text_with_easyocr(
        file_path=pdf_file,
        zoom=3.5,                    # Slightly higher for very small text
        save_to_file=output_file,
    )

    # Print a preview, and flag truncation only when it actually happened.
    print("\n=== EXTRACTED TEXT PREVIEW ===\n")
    print(extracted_text[:preview_limit])
    if len(extracted_text) > preview_limit:
        print("\n... (truncated)")