Spaces:

Cassius1Morbant
/

French_Legal_Chatbot

Running

App Files Files Community

French_Legal_Chatbot / extract.py

Cassius1Morbant

Upload extract.py

593231c verified 3 months ago

raw

history blame contribute delete

2.98 kB

	import pymupdf # PyMuPDF
	import easyocr
	import io
	import os

	# Build the EasyOCR reader once, at import time, so that every call to
	# extract_pdf_text_with_easyocr() reuses the same loaded models instead of
	# re-creating the reader per call.
	# Languages: French ('fr') and English ('en').
	# The first run downloads the model files (the original note estimates
	# ~400 MB); subsequent runs are fast.
	# NOTE(review): because this runs at import, merely importing this module
	# triggers the model load/download.
	reader = easyocr.Reader(['fr', 'en'], gpu=False) # Set gpu=True if you have CUDA

	def extract_pdf_text_with_easyocr(
	    file_path: str,
	    zoom: float = 3.0,  # Scale over PyMuPDF's 72 DPI base: 3.0 gives ~216 DPI
	    min_text_length: int = 50,  # Pages with no more text than this are skipped (likely blank)
	    save_to_file: str = None  # Optional: path to save extracted text
	) -> str:
	    """
	    OCR every page of a PDF with EasyOCR and return the concatenated text.

	    Each page is rendered to an RGB PNG image with PyMuPDF and passed to the
	    module-level EasyOCR ``reader``. Every page goes through OCR; the PDF's
	    embedded text layer (if any) is not consulted.

	    Args:
	        file_path: Path to the PDF file.
	        zoom: Render scale applied on both axes. PyMuPDF renders at 72 DPI
	            for a scale of 1.0, so the effective resolution is ``72 * zoom``.
	        min_text_length: A successfully OCR'd page is kept only if its text
	            is strictly longer than this many characters. Pages on which
	            OCR failed are always kept so the failure stays visible.
	        save_to_file: If truthy, the extracted text is also written to this
	            path as UTF-8. A failure to write is reported on stdout and
	            does not abort the extraction.

	    Returns:
	        The text of all kept pages, each preceded by a ``--- Page N ---``
	        header, with leading/trailing whitespace stripped. Empty string if
	        no page was kept.

	    Raises:
	        FileNotFoundError: If ``file_path`` does not exist.
	        RuntimeError: If PyMuPDF cannot open the file (original error is
	            chained as ``__cause__``).
	    """
	    if not os.path.exists(file_path):
	        raise FileNotFoundError(f"PDF file not found: {file_path}")

	    try:
	        doc = pymupdf.open(file_path)
	    except Exception as e:
	        # Chain the cause so the underlying PyMuPDF traceback is preserved.
	        raise RuntimeError(f"Failed to open PDF with PyMuPDF: {e}") from e

	    page_chunks = []
	    try:
	        # The render matrix does not depend on the page; build it once.
	        render_matrix = pymupdf.Matrix(zoom, zoom)

	        for page_number, page in enumerate(doc, start=1):
	            # Render page to a high-resolution pixmap, then to PNG bytes,
	            # which EasyOCR's readtext() accepts directly.
	            pix = page.get_pixmap(matrix=render_matrix, colorspace=pymupdf.csRGB)
	            img_bytes = pix.tobytes("png")

	            # Track failure with a flag rather than searching the page text
	            # for "OCR Error", which could also match genuine document text.
	            ocr_failed = False
	            try:
	                result = reader.readtext(
	                    img_bytes,
	                    detail=0,  # Return only text strings
	                    paragraph=True,  # Group into paragraphs
	                    width_ths=0.7,  # Adjust for better line grouping
	                    height_ths=0.7,
	                )
	                stripped_lines = (line.strip() for line in result)
	                page_text = "\n".join(line for line in stripped_lines if line)
	            except Exception as ocr_error:
	                # Best effort: one bad page must not abort the whole document.
	                ocr_failed = True
	                page_text = f"[OCR Error on page {page_number}: {ocr_error}]"

	            # Only add the page if meaningful text was extracted (or OCR failed).
	            if ocr_failed or len(page_text) > min_text_length:
	                page_chunks.append(f"--- Page {page_number} ---\n{page_text}\n\n")
	    finally:
	        # Always release the document, even if rendering raised.
	        doc.close()

	    full_text = "".join(page_chunks)

	    # Optional: save to file (best effort; failure is reported, not raised).
	    if save_to_file:
	        try:
	            with open(save_to_file, "w", encoding="utf-8") as f:
	                f.write(full_text)
	        except (OSError, UnicodeError) as e:
	            print(f"Failed to save file: {e}")
	        else:
	            print(f"Extracted text saved to: {save_to_file}")

	    return full_text.strip()

	# ———————— TEST USAGE ————————
	# Demo entry point: OCR a sample PDF, save the result next to the script,
	# and print the first 1000 characters as a preview.
	if __name__ == "__main__":
	    sample_pdf = "Kbis.pdf"  # Update with your actual filename/path
	    preview_limit = 1000

	    ocr_output = extract_pdf_text_with_easyocr(
	        sample_pdf,
	        zoom=3.5,  # Slightly higher for very small text
	        save_to_file="kbis_extracted.txt",
	    )

	    print("\n=== EXTRACTED TEXT PREVIEW ===\n")
	    print(ocr_output[:preview_limit])

	    # Flag truncation only when the text exceeds the preview window;
	    # otherwise emit an empty line.
	    if len(ocr_output) > preview_limit:
	        print("\n... (truncated)")
	    else:
	        print("")