French_Legal_Chatbot / extract.py
Cassius1Morbant's picture
Upload extract.py
593231c verified
import pymupdf # PyMuPDF
import easyocr
import io
import os
# Initialise EasyOCR once (French + English) at module level, outside the
# extraction function, so the models are loaded a single time and shared by
# every call. Note that this runs as a side effect of importing this module.
# First run will download models (~400 MB); subsequent runs are fast.
reader = easyocr.Reader(['fr', 'en'], gpu=False) # Set gpu=True if you have CUDA
def extract_pdf_text_with_easyocr(
    file_path: str,
    zoom: float = 3.0,  # Render scale relative to 72 DPI (3.0 -> 216 DPI)
    min_text_length: int = 50,  # Pages with this many characters or fewer are skipped
    save_to_file: "str | None" = None,  # Optional: path to save extracted text
) -> str:
    """Extract text from a PDF by rendering every page and running EasyOCR on it.

    Each page is rasterised with PyMuPDF at the requested zoom and the
    resulting PNG bytes are passed to the module-level ``reader``. OCR is
    always used; the PDF's embedded text layer (if any) is not consulted.

    Args:
        file_path: Path to the PDF file.
        zoom: Scale factor applied to both axes when rendering a page.
        min_text_length: A page is kept only if its OCR text is strictly
            longer than this many characters (pages whose OCR failed are
            always kept so the failure is visible in the output).
        save_to_file: If given, the extracted text is also written to this
            path as UTF-8. A failure to write is reported with ``print`` and
            does not abort the extraction.

    Returns:
        The concatenated page texts, each preceded by a ``--- Page N ---``
        header, with leading/trailing whitespace stripped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If PyMuPDF cannot open the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")

    try:
        doc = pymupdf.open(file_path)
    except Exception as e:
        # Chain the original error so the underlying cause stays in the traceback.
        raise RuntimeError(f"Failed to open PDF with PyMuPDF: {e}") from e

    page_chunks = []
    try:
        # The render matrix does not depend on the page, so build it once.
        matrix = pymupdf.Matrix(zoom, zoom)

        for page_num, page in enumerate(doc, start=1):
            # Render page to a high-resolution RGB pixmap, then to PNG bytes.
            pix = page.get_pixmap(matrix=matrix, colorspace=pymupdf.csRGB)
            img_bytes = pix.tobytes("png")

            # Perform OCR. A failure on one page is recorded in the output
            # instead of aborting the whole document (deliberate best-effort).
            ocr_failed = False
            try:
                result = reader.readtext(
                    img_bytes,
                    detail=0,        # Return only text strings
                    paragraph=True,  # Group into paragraphs
                    width_ths=0.7,   # Adjust for better line grouping
                    height_ths=0.7,
                )
                page_text = "\n".join(
                    line.strip() for line in result if line.strip()
                )
            except Exception as ocr_error:
                ocr_failed = True
                page_text = f"[OCR Error on page {page_num}: {ocr_error}]"

            # Keep the page if it has meaningful text or if OCR failed. An
            # explicit flag is used rather than searching the text for
            # "OCR Error", which would also match genuine page content.
            if ocr_failed or len(page_text) > min_text_length:
                page_chunks.append(f"--- Page {page_num} ---\n{page_text}\n\n")
    finally:
        # Always release the document, even if rendering raised.
        doc.close()

    full_text = "".join(page_chunks)

    # Optional: save to file (best-effort; errors are reported, not raised).
    if save_to_file:
        try:
            with open(save_to_file, "w", encoding="utf-8") as f:
                f.write(full_text)
            print(f"Extracted text saved to: {save_to_file}")
        except Exception as e:
            print(f"Failed to save file: {e}")

    return full_text.strip()
# ———————— TEST USAGE ————————
if __name__ == "__main__":
    # Demo run: OCR a sample PDF, save the result, and show a short preview.
    source_pdf = "Kbis.pdf"  # Update with your actual filename/path
    preview_limit = 1000  # Number of characters shown in the console preview

    ocr_output = extract_pdf_text_with_easyocr(
        file_path=source_pdf,
        zoom=3.5,  # Slightly higher for very small text
        save_to_file="kbis_extracted.txt",
    )

    # Print preview (first `preview_limit` characters)
    print("\n=== EXTRACTED TEXT PREVIEW ===\n")
    print(ocr_output[:preview_limit])
    if len(ocr_output) > preview_limit:
        print("\n... (truncated)")
    else:
        # An empty line is still printed when nothing was cut off.
        print("")