# CompassIA / src / pdf_processing.py
# (dembasowmr — commit 15d9931: "Reorganized the project: documents hosted
# on Firestore db, conversations saved")
import os
import io
import re
from PIL import Image
# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract
# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH
# Point pytesseract at the Tesseract binary (value comes from config.py,
# which reads the ENV set in the Dockerfile or falls back to a default).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
if POPPLER_PATH:
    # POPPLER_PATH only matters for local Windows development where Poppler
    # is not on the system PATH; in Docker, apt-get puts it on PATH.
    # pdf2image has no global setting — convert_from_path accepts a
    # poppler_path argument instead, so nothing is configured here.
    pass
def extract_text_from_pdf(pdf_path: str,
                          min_text_len: int = 100,
                          min_file_size: int = 10000) -> str:
    """
    Extract text from a PDF, trying direct (pdfminer) extraction first.

    If the extracted text is shorter than ``min_text_len`` characters while
    the file itself is larger than ``min_file_size`` bytes, the PDF is
    assumed to be image-based and OCR is attempted instead. OCR is also the
    fallback when direct extraction raises.

    Args:
        pdf_path: Path to the PDF file on disk.
        min_text_len: Threshold below which direct extraction is considered
            "sparse" (default 100 characters, the original heuristic).
        min_file_size: File size in bytes above which a sparse result
            triggers OCR (default 10 KB, the original heuristic).

    Returns:
        The extracted text, possibly empty if both strategies fail.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fp:
        try:
            extract_text_to_fp(fp, output_string, laparams=LAParams())
            text = output_string.getvalue()
            # Sparse text from a non-trivial file suggests an image-based PDF.
            if len(text.strip()) < min_text_len and os.path.getsize(pdf_path) > min_file_size:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return text
        except Exception as e:
            # Best-effort: any pdfminer failure falls back to OCR rather
            # than crashing the ingestion pipeline.
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
            return ocr_pdf(pdf_path)
def ocr_pdf(pdf_path: str) -> str:
    """
    Perform OCR on a PDF using pdf2image (Poppler) and pytesseract.

    Requires Tesseract to be installed (path taken from TESSERACT_CMD) and
    Poppler to be either on the system PATH or pointed at via POPPLER_PATH.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated OCR text of all pages, or "" if OCR fails.
    """
    all_text = []
    try:
        # Convert PDF pages to images at 300 DPI for better OCR accuracy.
        # Forward POPPLER_PATH when configured (local/Windows dev); in
        # Docker, Poppler is on PATH and POPPLER_PATH is falsy.
        # NOTE: previously POPPLER_PATH was never passed, so a configured
        # path was silently ignored and conversion failed off-PATH.
        poppler_kwargs = {'poppler_path': POPPLER_PATH} if POPPLER_PATH else {}
        images = convert_from_path(pdf_path, dpi=300, **poppler_kwargs)
        print(f" Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Language packs: English, Turkish, Arabic, French — the
            # Dockerfile must install the matching tesseract-ocr-* packages.
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra') # Updated languages
            all_text.append(page_text)
            print(f" Page {i+1} OCR complete.")
    except Exception as e:
        # Best-effort: return "" rather than crash the caller's pipeline.
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""
    return "\n".join(all_text)
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks of at most roughly ``max_chunk_size`` characters.

    Splitting prefers paragraph boundaries (blank-line separated); a single
    paragraph larger than the limit is split on spaces. A word longer than
    ``max_chunk_size`` is kept whole in its own chunk. After chunking, each
    chunk (except the first) is prefixed with the last ``overlap`` characters
    of the previous chunk, joined with a newline.

    Args:
        text: The text to split. Empty/None-ish input yields [].
        max_chunk_size: Soft upper bound on chunk length in characters.
        overlap: Number of trailing characters of the previous chunk to
            prepend to each subsequent chunk (0 disables overlap).

    Returns:
        List of chunk strings (possibly empty).
    """
    if not text:
        return []
    # Paragraphs are separated by one or more blank lines.
    paragraphs = re.split(r'\n\s*\n', text)
    chunks = []
    current_chunk = []
    current_chunk_len = 0
    for para in paragraphs:
        if not para.strip():
            continue
        # Would adding this paragraph (plus the "\n\n" separator) overflow
        # the current chunk? If so, finalize the current chunk first.
        if current_chunk_len + len(para) + len('\n\n') > max_chunk_size:
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0
            if len(para) > max_chunk_size:
                # Oversized paragraph: split on spaces into sub-chunks.
                sub_chunk = []
                sub_chunk_len = 0
                for word in para.split(' '):
                    # BUGFIX: only flush a NON-EMPTY sub-chunk. Previously a
                    # first word longer than max_chunk_size caused
                    # " ".join([]) == "" to be appended as an empty chunk.
                    if sub_chunk and sub_chunk_len + len(word) + len(' ') > max_chunk_size:
                        chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + len(' ')
                if sub_chunk:  # flush the remaining sub-chunk
                    chunks.append(" ".join(sub_chunk))
            else:
                # Paragraph fits on its own: start a fresh chunk with it.
                current_chunk.append(para)
                current_chunk_len += len(para) + len('\n\n')
        else:
            # Paragraph fits into the current chunk.
            current_chunk.append(para)
            current_chunk_len += len(para) + len('\n\n')
    if current_chunk:  # flush any remaining text
        chunks.append("\n\n".join(current_chunk))
    # Simplistic overlap: prepend the tail of the previous chunk.
    final_chunks_with_overlap = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            chunk = chunks[i - 1][-overlap:] + "\n" + chunk
        final_chunks_with_overlap.append(chunk)
    return final_chunks_with_overlap