# PDF text-extraction utilities: direct extraction, OCR fallback, and chunking.
import os
import io
import re

from PIL import Image
# Direct (non-OCR) text extraction from PDFs.
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
# OCR pipeline for image-based PDFs.
from pdf2image import convert_from_path
import pytesseract

# Tesseract/Poppler locations are centralized in the project config.
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH

# Point pytesseract at the configured Tesseract binary (ENV from the
# Dockerfile, or the local default).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

if POPPLER_PATH:
    # Intentionally a no-op: pdf2image exposes no global Poppler setting.
    # POPPLER_PATH is only relevant for local Windows development, where
    # convert_from_path(..., poppler_path=POPPLER_PATH) could be passed;
    # in Docker, Poppler is already on PATH via apt-get.
    pass
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from the PDF at *pdf_path*.

    Direct (pdfminer) extraction is attempted first. If it raises, or if it
    yields suspiciously little text for a non-trivial file (under 100
    characters stripped, file larger than ~10 KB — a sign the PDF is
    image-based), the function falls back to OCR via ``ocr_pdf``.

    Returns the extracted text (possibly empty if OCR also fails).
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    text_buffer = io.StringIO()
    with open(pdf_path, 'rb') as pdf_file:
        try:
            extract_text_to_fp(pdf_file, text_buffer, laparams=LAParams())
            extracted = text_buffer.getvalue()
            # Sparse text from a sizeable file suggests a scanned/image PDF.
            looks_image_based = (
                len(extracted.strip()) < 100
                and os.path.getsize(pdf_path) > 10000
            )
            if looks_image_based:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return extracted
        except Exception as exc:
            print(f"Direct PDF text extraction failed ({exc}). Attempting OCR...")
            return ocr_pdf(pdf_path)
def ocr_pdf(pdf_path: str) -> str:
    """
    Run OCR over every page of the PDF at *pdf_path*.

    Pages are rasterized at 300 DPI with pdf2image (Poppler must be on PATH;
    in Docker it is installed via apt-get) and fed to Tesseract with the
    eng/tur/ara/fra language packs. Returns the per-page text joined with
    newlines, or an empty string if the OCR pipeline fails.
    """
    extracted_pages: list[str] = []
    try:
        # Rasterize the PDF; 300 DPI gives Tesseract enough detail.
        # poppler_path=POPPLER_PATH could be passed here for local dev;
        # Docker resolves Poppler via PATH.
        page_images = convert_from_path(pdf_path, dpi=300)
        print(f" Performing OCR on {len(page_images)} pages...")
        for page_num, page_image in enumerate(page_images, start=1):
            # Language packs must be installed (tesseract-ocr-ara,
            # tesseract-ocr-fra, etc.) for non-English recognition.
            extracted_pages.append(
                pytesseract.image_to_string(page_image, lang='eng+tur+ara+fra')
            )
            print(f" Page {page_num} OCR complete.")
    except Exception as exc:
        print(f"OCR process failed: {exc}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""
    return "\n".join(extracted_pages)
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split *text* into chunks of at most roughly ``max_chunk_size`` characters.

    Paragraphs (separated by blank lines) are packed together while they fit.
    A single paragraph longer than ``max_chunk_size`` is split on spaces
    (a single word longer than the limit still becomes its own chunk).
    When ``overlap`` > 0, each chunk after the first is prefixed with the
    trailing ``overlap`` characters of the previous chunk, preserving some
    context across chunk boundaries.

    Args:
        text: Input text; may be empty.
        max_chunk_size: Soft upper bound on chunk length, in characters.
        overlap: Number of trailing characters of the previous chunk to
            prepend to each subsequent chunk.

    Returns:
        List of text chunks (empty list for empty input).
    """
    if not text:
        return []

    # Paragraph boundaries: a blank (possibly whitespace-only) line.
    paragraphs = re.split(r'\n\s*\n', text)
    chunks: list[str] = []
    current_chunk: list[str] = []
    current_chunk_len = 0
    sep_len = len('\n\n')  # cost of the separator used when joining paragraphs

    for para in paragraphs:
        if not para.strip():
            continue
        if current_chunk_len + len(para) + sep_len > max_chunk_size:
            # Finalize the chunk under construction before handling this paragraph.
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0
            if len(para) > max_chunk_size:
                # Oversized paragraph: fall back to word-level splitting.
                words = para.split(' ')
                sub_chunk: list[str] = []
                sub_chunk_len = 0
                for word in words:
                    if sub_chunk_len + len(word) + len(' ') > max_chunk_size:
                        # Bug fix: flush only a non-empty sub-chunk. Previously,
                        # a first word longer than max_chunk_size caused an
                        # empty "" chunk to be appended to the output.
                        if sub_chunk:
                            chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + len(' ')
                if sub_chunk:  # flush the remaining words
                    chunks.append(" ".join(sub_chunk))
            else:
                # Paragraph fits on its own: start a fresh chunk with it.
                current_chunk.append(para)
                current_chunk_len += len(para) + sep_len
        else:
            # Paragraph fits into the current chunk.
            current_chunk.append(para)
            current_chunk_len += len(para) + sep_len

    if current_chunk:  # flush any remaining text
        chunks.append("\n\n".join(current_chunk))

    if overlap <= 0:
        return chunks

    # Simplistic overlap: prefix each chunk (after the first) with the tail
    # of its ORIGINAL predecessor (not the already-overlapped version).
    final_chunks_with_overlap: list[str] = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            chunk = chunks[i - 1][-overlap:] + "\n" + chunk
        final_chunks_with_overlap.append(chunk)
    return final_chunks_with_overlap