# CompassIA / src / pdf_processing.py
# (dembasowmr — commit 15d9931: "Reorganized the project: documents hosted
# on Firestore db, conversations saved")
import os
import io
import re
from PIL import Image
# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract
# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH
# Point pytesseract at the Tesseract binary (value comes from config.py,
# which reads the ENV set in the Dockerfile or falls back to a default).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
if POPPLER_PATH:
    # POPPLER_PATH only matters for local Windows development where Poppler
    # is not on the system PATH; in Docker, apt-get puts it on PATH.
    # pdf2image has no global setting — convert_from_path accepts a
    # poppler_path argument instead, so nothing is configured here.
    pass
def extract_text_from_pdf(pdf_path: str,
                          min_text_len: int = 100,
                          min_file_size: int = 10000) -> str:
    """
    Extract text from a PDF, trying direct (pdfminer) extraction first.

    If the extracted text is shorter than ``min_text_len`` characters while
    the file itself is larger than ``min_file_size`` bytes, the PDF is
    assumed to be image-based and OCR is attempted instead. OCR is also the
    fallback when direct extraction raises.

    Args:
        pdf_path: Path to the PDF file on disk.
        min_text_len: Threshold below which direct extraction is considered
            "sparse" (default 100 characters, the original heuristic).
        min_file_size: File size in bytes above which a sparse result
            triggers OCR (default 10 KB, the original heuristic).

    Returns:
        The extracted text, possibly empty if both strategies fail.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fp:
        try:
            extract_text_to_fp(fp, output_string, laparams=LAParams())
            text = output_string.getvalue()
            # Sparse text from a non-trivial file suggests an image-based PDF.
            if len(text.strip()) < min_text_len and os.path.getsize(pdf_path) > min_file_size:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return text
        except Exception as e:
            # Best-effort: any pdfminer failure falls back to OCR rather
            # than crashing the ingestion pipeline.
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
            return ocr_pdf(pdf_path)
def ocr_pdf(pdf_path: str) -> str:
    """
    Perform OCR on a PDF using pdf2image (Poppler) and pytesseract.

    Requires Tesseract to be installed (path taken from TESSERACT_CMD) and
    Poppler to be either on the system PATH or pointed at via POPPLER_PATH.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated OCR text of all pages, or "" if OCR fails.
    """
    all_text = []
    try:
        # Convert PDF pages to images at 300 DPI for better OCR accuracy.
        # Forward POPPLER_PATH when configured (local/Windows dev); in
        # Docker, Poppler is on PATH and POPPLER_PATH is falsy.
        # NOTE: previously POPPLER_PATH was never passed, so a configured
        # path was silently ignored and conversion failed off-PATH.
        poppler_kwargs = {'poppler_path': POPPLER_PATH} if POPPLER_PATH else {}
        images = convert_from_path(pdf_path, dpi=300, **poppler_kwargs)
        print(f" Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Language packs: English, Turkish, Arabic, French — the
            # Dockerfile must install the matching tesseract-ocr-* packages.
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra') # Updated languages
            all_text.append(page_text)
            print(f" Page {i+1} OCR complete.")
    except Exception as e:
        # Best-effort: return "" rather than crash the caller's pipeline.
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""
    return "\n".join(all_text)
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks of at most roughly ``max_chunk_size`` characters.

    Splitting prefers paragraph boundaries (blank-line separated); a single
    paragraph larger than the limit is split on spaces. A word longer than
    ``max_chunk_size`` is kept whole in its own chunk. After chunking, each
    chunk (except the first) is prefixed with the last ``overlap`` characters
    of the previous chunk, joined with a newline.

    Args:
        text: The text to split. Empty/None-ish input yields [].
        max_chunk_size: Soft upper bound on chunk length in characters.
        overlap: Number of trailing characters of the previous chunk to
            prepend to each subsequent chunk (0 disables overlap).

    Returns:
        List of chunk strings (possibly empty).
    """
    if not text:
        return []
    # Paragraphs are separated by one or more blank lines.
    paragraphs = re.split(r'\n\s*\n', text)
    chunks = []
    current_chunk = []
    current_chunk_len = 0
    for para in paragraphs:
        if not para.strip():
            continue
        # Would adding this paragraph (plus the "\n\n" separator) overflow
        # the current chunk? If so, finalize the current chunk first.
        if current_chunk_len + len(para) + len('\n\n') > max_chunk_size:
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0
            if len(para) > max_chunk_size:
                # Oversized paragraph: split on spaces into sub-chunks.
                sub_chunk = []
                sub_chunk_len = 0
                for word in para.split(' '):
                    # BUGFIX: only flush a NON-EMPTY sub-chunk. Previously a
                    # first word longer than max_chunk_size caused
                    # " ".join([]) == "" to be appended as an empty chunk.
                    if sub_chunk and sub_chunk_len + len(word) + len(' ') > max_chunk_size:
                        chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + len(' ')
                if sub_chunk:  # flush the remaining sub-chunk
                    chunks.append(" ".join(sub_chunk))
            else:
                # Paragraph fits on its own: start a fresh chunk with it.
                current_chunk.append(para)
                current_chunk_len += len(para) + len('\n\n')
        else:
            # Paragraph fits into the current chunk.
            current_chunk.append(para)
            current_chunk_len += len(para) + len('\n\n')
    if current_chunk:  # flush any remaining text
        chunks.append("\n\n".join(current_chunk))
    # Simplistic overlap: prepend the tail of the previous chunk.
    final_chunks_with_overlap = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            chunk = chunks[i - 1][-overlap:] + "\n" + chunk
        final_chunks_with_overlap.append(chunk)
    return final_chunks_with_overlap