File size: 5,582 Bytes
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import io
import re
from PIL import Image

# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract

# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH

# Point pytesseract at the Tesseract binary from config (Dockerfile ENV or a
# local-development override).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
if POPPLER_PATH:
    # NOTE(review): this branch is a no-op. pdf2image exposes no global
    # Poppler setting; POPPLER_PATH would have to be passed per call as
    # convert_from_path(..., poppler_path=POPPLER_PATH). It is typically only
    # needed for local Windows development — in Docker, Poppler is expected
    # to be on PATH (installed via apt-get).
    pass  # pdf2image resolves Poppler from PATH unless poppler_path is given per call


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF, preferring direct (embedded-text) extraction.

    Falls back to OCR via ocr_pdf() when direct extraction raises, or when
    it yields suspiciously little text for a non-trivial file (fewer than
    100 characters from a file larger than 10 KB suggests a scanned,
    image-based PDF).

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The extracted (or OCR'd) text.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as handle:
        try:
            extract_text_to_fp(handle, buffer, laparams=LAParams())
            text = buffer.getvalue()
            # Heuristic: sparse text from a sizable file → likely image-based.
            sparse = len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000
            if not sparse:
                return text
            print("Direct extraction yielded sparse text. Attempting OCR...")
        except Exception as e:
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
    return ocr_pdf(pdf_path)

def ocr_pdf(pdf_path: str) -> str:
    """
    Perform OCR on a PDF file using pdf2image and pytesseract.

    Requires Tesseract and Poppler to be installed — either on the system
    PATH (the Docker case) or, for Poppler, located via POPPLER_PATH from
    config (local development, e.g. Windows).

    Args:
        pdf_path: Path to the PDF file to OCR.

    Returns:
        The OCR text of all pages joined with newlines, or "" if the OCR
        process fails (best-effort: errors are reported, not raised).
    """
    all_text = []
    try:
        # Convert PDF pages to images. Higher DPI improves OCR accuracy.
        # Fix: honor POPPLER_PATH when configured (local dev where Poppler
        # is not on PATH); `or None` falls back to PATH resolution in Docker.
        images = convert_from_path(
            pdf_path, dpi=300, poppler_path=POPPLER_PATH or None
        )

        print(f"  Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Requires the matching Tesseract language packs to be installed
            # (e.g. tesseract-ocr-tur / -ara / -fra in the Dockerfile).
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra')
            all_text.append(page_text)
            print(f"    Page {i+1} OCR complete.")

    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""

    return "\n".join(all_text)

def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks of at most ``max_chunk_size`` characters.

    Splitting prefers paragraph boundaries (blank lines); a single paragraph
    longer than ``max_chunk_size`` is further split on spaces. When
    ``overlap`` > 0, every chunk after the first is prefixed with the last
    ``overlap`` characters of its predecessor (so those chunks may exceed
    ``max_chunk_size`` by up to ``overlap`` + 1 characters).

    Args:
        text: Input text; empty or whitespace-only input yields [].
        max_chunk_size: Soft maximum chunk length in characters.
        overlap: Trailing characters of the previous chunk to prepend to
            each subsequent chunk (0 disables overlap).

    Returns:
        The list of text chunks (possibly empty).
    """
    if not text:
        return []

    sep_len = len('\n\n')  # separator used when re-joining paragraphs
    paragraphs = re.split(r'\n\s*\n', text)
    chunks: list[str] = []
    current: list[str] = []
    current_len = 0

    for para in paragraphs:
        if not para.strip():
            continue

        # Paragraph won't fit in the open chunk: flush the chunk first.
        if current_len + len(para) + sep_len > max_chunk_size:
            if current:
                chunks.append("\n\n".join(current))
                current = []
                current_len = 0

            # An oversized paragraph becomes its own word-split chunks.
            if len(para) > max_chunk_size:
                chunks.extend(_split_oversized_paragraph(para, max_chunk_size))
                continue

        current.append(para)
        current_len += len(para) + sep_len

    if current:  # flush any remaining text
        chunks.append("\n\n".join(current))

    return _apply_overlap(chunks, overlap)


def _split_oversized_paragraph(para: str, max_chunk_size: int) -> list[str]:
    """Split a paragraph longer than max_chunk_size on spaces.

    Bug fix vs. the previous version: never emits an empty chunk when the
    very first word alone exceeds max_chunk_size.
    """
    pieces: list[str] = []
    sub: list[str] = []
    sub_len = 0
    for word in para.split(' '):
        # +1 accounts for the joining space; only flush a non-empty sub-chunk.
        if sub and sub_len + len(word) + 1 > max_chunk_size:
            pieces.append(" ".join(sub))
            sub = [word]
            sub_len = len(word)
        else:
            sub.append(word)
            sub_len += len(word) + 1
    if sub:  # add the remaining sub-chunk
        pieces.append(" ".join(sub))
    return pieces


def _apply_overlap(chunks: list[str], overlap: int) -> list[str]:
    """Prefix each chunk after the first with the tail of its predecessor."""
    if overlap <= 0 or len(chunks) < 2:
        return chunks
    result = [chunks[0]]
    for prev, cur in zip(chunks, chunks[1:]):
        result.append(prev[-overlap:] + "\n" + cur)
    return result