import os
import io
import re
from PIL import Image  # kept: may be relied on elsewhere in the project

# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract

# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH

# Set Tesseract command explicitly (uses ENV from Dockerfile or default)
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

# pdf2image has no global Poppler setting; instead we forward the configured
# path to every convert_from_path() call.  This is only needed for local
# (e.g. Windows) development where Poppler is not on PATH -- in Docker,
# Poppler is installed via apt-get and found on PATH, so this stays empty.
_POPPLER_KWARGS = {"poppler_path": POPPLER_PATH} if POPPLER_PATH else {}


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF.

    Tries direct (embedded) text extraction first.  If sparse text is found
    for a non-trivial file -- suggesting an image-based / scanned PDF -- it
    falls back to OCR via ocr_pdf().

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The extracted text; "" if both direct extraction and OCR fail.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    output_string = io.StringIO()
    with open(pdf_path, 'rb') as fp:
        try:
            extract_text_to_fp(fp, output_string, laparams=LAParams())
            text = output_string.getvalue()
            # Heuristic: if text is very short for a non-empty PDF, it is
            # probably image-based.  Thresholds: <100 chars of text in a
            # file larger than 10 KB.
            if len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return text
        except Exception as e:
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
            return ocr_pdf(pdf_path)


def ocr_pdf(pdf_path: str) -> str:
    """
    Performs OCR on a PDF file using pdf2image and pytesseract.

    Requires Tesseract and Poppler to be installed and in the system PATH
    (or POPPLER_PATH set in config.py for local development -- it is
    forwarded to pdf2image via _POPPLER_KWARGS).

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The OCR'd text of all pages joined by newlines; "" on failure.
    """
    all_text = []
    try:
        # Convert PDF pages to images.  Higher DPI for better OCR.
        # POPPLER_PATH is forwarded when set (Docker handles it via PATH).
        images = convert_from_path(pdf_path, dpi=300, **_POPPLER_KWARGS)
        print(f" Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Tesseract language packs: 'eng' English, 'tur' Turkish,
            # 'ara' Arabic, 'fra' French.  The Dockerfile must install the
            # matching tesseract-ocr-* packages for each.
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra')
            all_text.append(page_text)
            print(f" Page {i+1} OCR complete.")
    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""
    return "\n".join(all_text)


def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Splits text into chunks of a maximum size with optional overlap.

    Splits on blank-line paragraph boundaries first; a single paragraph
    longer than max_chunk_size is further split on spaces.  When
    overlap > 0, each chunk after the first is prefixed with the last
    `overlap` characters of the previous chunk.

    Args:
        text: The text to split.
        max_chunk_size: Soft upper bound on chunk length, in characters.
        overlap: Number of trailing characters of the previous chunk to
            prepend to each subsequent chunk.

    Returns:
        List of chunk strings (empty list for empty input).
    """
    if not text:
        return []

    sep_len = len('\n\n')  # join cost between paragraphs within a chunk
    # Simple paragraph-based chunking: blank lines delimit paragraphs.
    paragraphs = re.split(r'\n\s*\n', text)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_chunk_len = 0

    for para in paragraphs:
        if not para.strip():
            continue
        # If adding this paragraph (plus separator) would exceed the limit,
        # finalize the chunk built so far before placing the paragraph.
        if current_chunk_len + len(para) + sep_len > max_chunk_size:
            if current_chunk:  # Only append if current_chunk is not empty
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0
            # If a single paragraph is larger than max_chunk_size, split it by words
            if len(para) > max_chunk_size:
                words = para.split(' ')
                sub_chunk: list[str] = []
                sub_chunk_len = 0
                for word in words:
                    if sub_chunk_len + len(word) + len(' ') > max_chunk_size:
                        # Guard against a single word longer than the limit:
                        # never emit an empty chunk.
                        if sub_chunk:
                            chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + len(' ')
                if sub_chunk:  # Add remaining sub-chunk
                    chunks.append(" ".join(sub_chunk))
            else:
                # Paragraph fits into a new chunk
                current_chunk.append(para)
                current_chunk_len += len(para) + sep_len
        else:
            # Paragraph fits into the current chunk
            current_chunk.append(para)
            current_chunk_len += len(para) + sep_len

    if current_chunk:  # Add any remaining text
        chunks.append("\n\n".join(current_chunk))

    # Apply overlap: This is a simplistic overlap implementation.
    final_chunks_with_overlap = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            # Take a portion of the previous chunk to overlap
            chunk = chunks[i - 1][-overlap:] + "\n" + chunk
        final_chunks_with_overlap.append(chunk)
    return final_chunks_with_overlap