import os
import io
import re
from PIL import Image
# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract
# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH
# Point pytesseract at the Tesseract binary configured in config.py
# (comes from the Dockerfile ENV or a sensible default).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

# POPPLER_PATH matters only for local development (typically Windows) where
# Poppler is not on the system PATH. pdf2image has no global setting for it —
# convert_from_path accepts a poppler_path argument instead — and in Docker
# Poppler is installed via apt-get and found through PATH, so nothing is set.
if POPPLER_PATH:
    pass
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF, preferring direct (non-OCR) extraction.

    When direct extraction fails, or yields suspiciously little text for a
    non-trivial file (suggesting a scanned, image-based PDF), this falls
    back to OCR via ocr_pdf().

    :param pdf_path: Path to the PDF file on disk.
    :return: The extracted text (possibly empty).
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        try:
            extract_text_to_fp(pdf_file, buffer, laparams=LAParams())
            extracted = buffer.getvalue()
            # Heuristic: fewer than 100 chars of text out of a file larger
            # than 10 KB usually means the PDF is image-based — try OCR.
            if len(extracted.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return extracted
        except Exception as exc:
            print(f"Direct PDF text extraction failed ({exc}). Attempting OCR...")
            return ocr_pdf(pdf_path)
def ocr_pdf(pdf_path: str) -> str:
    """
    Run OCR over every page of a PDF using pdf2image + pytesseract.

    Requires the Tesseract and Poppler executables to be installed and
    discoverable on the system PATH. Returns the concatenated page text,
    or an empty string if the OCR pipeline fails.

    :param pdf_path: Path to the PDF file on disk.
    :return: Text of all pages joined with newlines, or "" on failure.
    """
    pages_text = []
    try:
        # Render pages at 300 DPI — higher resolution improves OCR accuracy.
        # (Pass poppler_path=POPPLER_PATH here for local dev if Poppler is
        # not on PATH; in Docker it is found via PATH.)
        page_images = convert_from_path(pdf_path, dpi=300)
        print(f" Performing OCR on {len(page_images)} pages...")
        for index, page_image in enumerate(page_images):
            # Language packs must match what is installed alongside
            # Tesseract ('eng'/'tur' plus tesseract-ocr-ara and
            # tesseract-ocr-fra for Arabic and French support).
            page_text = pytesseract.image_to_string(page_image, lang='eng+tur+ara+fra')
            pages_text.append(page_text)
            print(f" Page {index+1} OCR complete.")
    except Exception as exc:
        print(f"OCR process failed: {exc}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""
    return "\n".join(pages_text)
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks of at most roughly max_chunk_size characters.

    Splits on blank-line paragraph boundaries first; a paragraph longer than
    max_chunk_size is further split on spaces into word-level sub-chunks.
    When overlap > 0, each chunk after the first is prefixed with the
    trailing `overlap` characters of the previous chunk (a simplistic
    overlap that may cut mid-word) to preserve context across boundaries.

    :param text: Input text; empty or falsy input yields [].
    :param max_chunk_size: Soft upper bound on characters per chunk (a
        single word longer than this still becomes one oversized chunk).
    :param overlap: Number of trailing characters of the previous chunk to
        prepend to the next chunk.
    :return: List of text chunks.
    """
    if not text:
        return []

    # Paragraphs are separated by one or more blank lines.
    paragraphs = re.split(r'\n\s*\n', text)
    chunks = []
    current_chunk = []
    current_chunk_len = 0
    for para in paragraphs:
        if not para.strip():
            continue
        # If adding this paragraph (plus its separator) would overflow the
        # current chunk, finalize the current chunk first.
        if current_chunk_len + len(para) + len('\n\n') > max_chunk_size:
            if current_chunk:  # Only flush a non-empty chunk.
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0
            if len(para) > max_chunk_size:
                # Oversized paragraph: split it into word-level sub-chunks.
                words = para.split(' ')
                sub_chunk = []
                sub_chunk_len = 0
                for word in words:
                    if sub_chunk_len + len(word) + len(' ') > max_chunk_size:
                        # BUGFIX: only flush a non-empty sub-chunk. A word
                        # longer than max_chunk_size previously caused an
                        # empty "" chunk to be appended here.
                        if sub_chunk:
                            chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + len(' ')
                if sub_chunk:  # Flush the trailing sub-chunk.
                    chunks.append(" ".join(sub_chunk))
            else:  # Paragraph fits into a fresh chunk.
                current_chunk.append(para)
                current_chunk_len += len(para) + len('\n\n')
        else:  # Paragraph fits into the current chunk.
            current_chunk.append(para)
            current_chunk_len += len(para) + len('\n\n')
    if current_chunk:  # Flush any remaining paragraphs.
        chunks.append("\n\n".join(current_chunk))

    # Simplistic overlap: prefix each chunk with the tail of its predecessor.
    final_chunks_with_overlap = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            chunk = chunks[i - 1][-overlap:] + "\n" + chunk
        final_chunks_with_overlap.append(chunk)
    return final_chunks_with_overlap