import io
import os
import shutil
import uuid
from typing import Optional

import nltk
import pdfplumber
import pytesseract
from fastapi import UploadFile
from nltk.tokenize import sent_tokenize
from pdf2image import convert_from_bytes
from PIL import Image

from app.config import params

# Ensure the "punkt" sentence-tokenizer data is present. It is only downloaded
# automatically outside production; when ENV == "production" and the data is
# missing, nothing is done here and sent_tokenize() will raise LookupError on
# first use.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    if os.environ.get("ENV") != "production":  # safe for local dev
        nltk.download("punkt")

# Probe once at import time; every OCR path below is gated on this flag.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
if not TESSERACT_AVAILABLE:
    print("⚠️ Warning: Tesseract not found in PATH — OCR will be skipped.")


def preprocess_image(
    image: Image.Image, threshold: Optional[int] = None
) -> Image.Image:
    """Binarize a PIL image to improve OCR accuracy.

    The image is converted to 8-bit grayscale and then to a 1-bit image:
    pixels darker than the threshold become black, all others white.

    Args:
        image: Source image in any mode Pillow can convert to "L".
        threshold: Grayscale cut-off. When omitted, the configured
            ``params["ocr"]["threshold"]`` value is used.

    Returns:
        A new mode-"1" (black and white) image.
    """
    if threshold is None:
        # Previously this value was read but a literal 140 was applied
        # instead, so the configuration had no effect.
        threshold = params["ocr"]["threshold"]
    gray = image.convert("L")
    return gray.point(lambda px: 0 if px < threshold else 255, "1")


def _ocr_image(image: Image.Image, psm) -> Optional[str]:
    """Preprocess *image* and OCR it; return None if tesseract is missing."""
    prepared = preprocess_image(image)
    try:
        return pytesseract.image_to_string(prepared, config=f"--psm {psm}")
    except pytesseract.TesseractNotFoundError:
        # The binary can disappear (or PATH can differ) after the import-time
        # probe, so this is still handled at call time.
        print("❌ Tesseract not found at OCR time.")
        return None


async def extract_chunks_from_file(file: UploadFile) -> list[dict]:
    """Extract sentence-level chunks with metadata from a PDF or image upload.

    PDFs are read with pdfplumber page by page; a page without extractable
    text is OCR'd when tesseract is available. If pdfplumber fails, the
    document is rasterized with pdf2image and the pages that have not produced
    chunks yet are OCR'd. PNG/JPEG uploads are OCR'd as a single page. Any
    other file type (including an upload without a filename) yields no chunks.

    Args:
        file: The uploaded file. Its extension (case-insensitive) selects the
            extraction path.

    Returns:
        A list of dicts with the keys ``doc_id``, ``filename``, ``page``,
        ``sentence``, ``text`` and ``text_length``. ``page`` and ``sentence``
        are 1-based; ``filename`` is the upload's original filename.
    """
    content = await file.read()
    # An upload may arrive without a filename; treat it as an unsupported type
    # instead of failing on None.lower().
    filename = (file.filename or "").lower()
    file_id = f"DOC{str(uuid.uuid4())[:5].upper()}"  # Short custom doc ID
    chunks: list[dict] = []

    def chunk_sentences(text, doc_id, page_number, filename):
        # The sentence number counts every tokenized sentence, including
        # blank ones that are skipped, so numbering may have gaps.
        for sent_number, sentence in enumerate(sent_tokenize(text), start=1):
            clean_sentence = sentence.strip().replace("\n", " ")
            if clean_sentence:
                chunks.append({
                    "doc_id": doc_id,
                    "filename": filename,
                    "page": page_number,
                    "sentence": sent_number,
                    "text": clean_sentence,
                    "text_length": len(clean_sentence),
                })

    dpi = params["ocr"]["dpi"]
    psm = params["ocr"]["tesseract_psm"]

    if filename.endswith(".pdf"):
        try:
            with pdfplumber.open(io.BytesIO(content)) as pdf:
                print(f"🧾 PDF opened: {filename}")
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        print(f"📄 Page {page_number} text preview: {repr(page_text[:100])}")
                        chunk_sentences(page_text, file_id, page_number, file.filename)
                        continue

                    print(f"⚠️ Page {page_number} has no text.")
                    if not TESSERACT_AVAILABLE:
                        print(" ↪️ Skipping OCR (tesseract missing)")
                        continue

                    print(" ↪️ Falling back to OCR on this page")
                    img = page.to_image(resolution=dpi).original
                    ocr_text = _ocr_image(img, psm)
                    if ocr_text is not None:
                        print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                        chunk_sentences(ocr_text, file_id, page_number, file.filename)
        except Exception as e:
            # Deliberately broad: any failure while parsing with pdfplumber
            # triggers the whole-document OCR fallback.
            print(f"❌ PDFPlumber error for {filename}: {e}")
            if TESSERACT_AVAILABLE:
                print("📸 OCR fallback for entire PDF...")
                # Pages chunked before the failure must not be OCR'd again,
                # otherwise their sentences would be emitted twice.
                pages_done = {chunk["page"] for chunk in chunks}
                images = convert_from_bytes(content, dpi=dpi)
                for page_number, img in enumerate(images, start=1):
                    if page_number in pages_done:
                        continue
                    ocr_text = _ocr_image(img, psm)
                    if ocr_text is not None:
                        print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                        chunk_sentences(ocr_text, file_id, page_number, file.filename)
            else:
                print(" ↪️ Skipping full-PDF OCR (tesseract missing)")

    elif filename.endswith((".png", ".jpg", ".jpeg")):
        print(f"🖼️ Image file detected: {filename}")
        if TESSERACT_AVAILABLE:
            image = Image.open(io.BytesIO(content))
            ocr_text = _ocr_image(image, psm)
            if ocr_text is not None:
                print(f"🖨️ OCR text preview: {repr(ocr_text[:100])}")
                chunk_sentences(ocr_text, file_id, page_number=1, filename=file.filename)
        else:
            print(" ↪️ Skipping OCR on image (tesseract missing)")

    print(f"✅ Extracted {len(chunks)} chunks from {filename}")
    return chunks