File size: 5,582 Bytes
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import io
import re
from PIL import Image

# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract

# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH

# Point pytesseract at the Tesseract binary from config (Dockerfile ENV or a
# local-development override).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
if POPPLER_PATH:
    # NOTE(review): this branch is a no-op. pdf2image exposes no global
    # Poppler setting; POPPLER_PATH would have to be passed per call as
    # convert_from_path(..., poppler_path=POPPLER_PATH). It is typically only
    # needed for local Windows development — in Docker, Poppler is expected
    # to be on PATH (installed via apt-get).
    pass  # pdf2image resolves Poppler from PATH unless poppler_path is given per call


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF, preferring direct (embedded-text) extraction.

    Falls back to OCR via ocr_pdf() when direct extraction raises, or when
    it yields suspiciously little text for a non-trivial file (fewer than
    100 characters from a file larger than 10 KB suggests a scanned,
    image-based PDF).

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The extracted (or OCR'd) text.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as handle:
        try:
            extract_text_to_fp(handle, buffer, laparams=LAParams())
            text = buffer.getvalue()
            # Heuristic: sparse text from a sizable file → likely image-based.
            sparse = len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000
            if not sparse:
                return text
            print("Direct extraction yielded sparse text. Attempting OCR...")
        except Exception as e:
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
    return ocr_pdf(pdf_path)

def ocr_pdf(pdf_path: str) -> str:
    """
    Perform OCR on a PDF file using pdf2image and pytesseract.

    Requires Tesseract and Poppler to be installed — either on the system
    PATH (the Docker case) or, for Poppler, located via POPPLER_PATH from
    config (local development, e.g. Windows).

    Args:
        pdf_path: Path to the PDF file to OCR.

    Returns:
        The OCR text of all pages joined with newlines, or "" if the OCR
        process fails (best-effort: errors are reported, not raised).
    """
    all_text = []
    try:
        # Convert PDF pages to images. Higher DPI improves OCR accuracy.
        # Fix: honor POPPLER_PATH when configured (local dev where Poppler
        # is not on PATH); `or None` falls back to PATH resolution in Docker.
        images = convert_from_path(
            pdf_path, dpi=300, poppler_path=POPPLER_PATH or None
        )

        print(f"  Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Requires the matching Tesseract language packs to be installed
            # (e.g. tesseract-ocr-tur / -ara / -fra in the Dockerfile).
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra')
            all_text.append(page_text)
            print(f"    Page {i+1} OCR complete.")

    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""

    return "\n".join(all_text)

def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks of at most ``max_chunk_size`` characters.

    Splitting prefers paragraph boundaries (blank lines); a single paragraph
    longer than ``max_chunk_size`` is further split on spaces. When
    ``overlap`` > 0, every chunk after the first is prefixed with the last
    ``overlap`` characters of its predecessor (so those chunks may exceed
    ``max_chunk_size`` by up to ``overlap`` + 1 characters).

    Args:
        text: Input text; empty or whitespace-only input yields [].
        max_chunk_size: Soft maximum chunk length in characters.
        overlap: Trailing characters of the previous chunk to prepend to
            each subsequent chunk (0 disables overlap).

    Returns:
        The list of text chunks (possibly empty).
    """
    if not text:
        return []

    sep_len = len('\n\n')  # separator used when re-joining paragraphs
    paragraphs = re.split(r'\n\s*\n', text)
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for para in paragraphs:
        if not para.strip():
            continue

        # Paragraph won't fit in the open chunk: flush the chunk first.
        if current_len + len(para) + sep_len > max_chunk_size:
            if current:
                chunks.append("\n\n".join(current))
                current = []
                current_len = 0

            # An oversized paragraph becomes its own word-split chunks.
            if len(para) > max_chunk_size:
                chunks.extend(_split_oversized_paragraph(para, max_chunk_size))
                continue

        current.append(para)
        current_len += len(para) + sep_len

    if current:  # flush any remaining text
        chunks.append("\n\n".join(current))

    return _apply_overlap(chunks, overlap)


def _split_oversized_paragraph(para: str, max_chunk_size: int) -> list[str]:
    """Split a paragraph longer than max_chunk_size on spaces.

    Bug fix vs. the previous version: never emits an empty chunk when the
    very first word alone exceeds max_chunk_size.
    """
    pieces: list[str] = []
    sub: list[str] = []
    sub_len = 0
    for word in para.split(' '):
        # +1 accounts for the joining space; only flush a non-empty sub-chunk.
        if sub and sub_len + len(word) + 1 > max_chunk_size:
            pieces.append(" ".join(sub))
            sub = [word]
            sub_len = len(word)
        else:
            sub.append(word)
            sub_len += len(word) + 1
    if sub:  # add the remaining sub-chunk
        pieces.append(" ".join(sub))
    return pieces


def _apply_overlap(chunks: list[str], overlap: int) -> list[str]:
    """Prefix each chunk after the first with the tail of its predecessor."""
    if overlap <= 0 or len(chunks) < 2:
        return chunks
    result = [chunks[0]]
    for prev, cur in zip(chunks, chunks[1:]):
        result.append(prev[-overlap:] + "\n" + cur)
    return result