File size: 5,254 Bytes
c8769a7
 
 
 
 
 
 
 
 
 
3ee1c38
 
c8769a7
3ee1c38
 
 
 
 
c8769a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from PIL import Image
import pytesseract
import pdfplumber
import io
from fastapi import UploadFile
from pdf2image import convert_from_bytes
import uuid
import nltk
from nltk.tokenize import sent_tokenize
from app.config import params
import shutil
import os     

# Ensure the NLTK "punkt" sentence tokenizer is present; sent_tokenize() below
# depends on it.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    if os.environ.get("ENV") != "production":  # safe for local dev
        nltk.download("punkt")
        # NOTE(review): when ENV == "production" a missing "punkt" model is left
        # unresolved here, so sent_tokenize() will raise at request time —
        # confirm the deployment image pre-installs the model.

# Probe for the tesseract binary once at import time; OCR code paths check this
# flag and are skipped when the binary is absent.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
if not TESSERACT_AVAILABLE:
    print("⚠️ Warning: Tesseract not found in PATH — OCR will be skipped.")

def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Preprocess a PIL image for better OCR accuracy.

    Converts the image to 8-bit grayscale, then applies a binary threshold
    taken from ``params["ocr"]["threshold"]``.

    Args:
        image: Source image in any PIL mode.

    Returns:
        A mode-"1" (bilevel) image: pixels below the threshold become black,
        all others white.
    """
    gray = image.convert("L")  # grayscale
    thresh = params["ocr"]["threshold"]
    # Bug fix: the threshold was read from config but the lambda hard-coded
    # 140, silently ignoring the configured value. Use `thresh` instead.
    bw = gray.point(lambda x: 0 if x < thresh else 255, '1')  # simple binary threshold
    return bw

async def extract_chunks_from_file(file: UploadFile) -> list[dict]:
    """
    Extract sentence-level chunks with metadata from a PDF or image upload.

    Strategy:
      * PDF: extract text per page via pdfplumber; pages without an embedded
        text layer fall back to per-page OCR (when tesseract is available).
        If pdfplumber fails on the whole file, the entire PDF is rasterized
        with pdf2image and OCR'd page by page.
      * PNG/JPG/JPEG: OCR the single image, recorded as page 1.
      * Any other extension produces no chunks (returns []).

    Args:
        file: FastAPI UploadFile; the entire payload is read into memory.

    Returns:
        List of dicts with keys:
        'doc_id', 'filename', 'page', 'sentence', 'text', 'text_length'.
    """
    content = await file.read()
    filename = file.filename.lower()
    # Short human-readable document id, e.g. "DOC1A2B3" (first 5 hex chars of
    # a UUID4, upper-cased).
    file_id = f"DOC{str(uuid.uuid4())[:5].upper()}"  # Short custom doc ID
    chunks = []

    def chunk_sentences(text: str, doc_id: str, page_number: int, filename: str) -> None:
        """Split `text` into sentences and append one chunk dict per sentence."""
        sentences = sent_tokenize(text)
        for sent_number, sentence in enumerate(sentences, start=1):
            # Collapse intra-sentence newlines so each chunk is a single line.
            clean_sentence = sentence.strip().replace("\n", " ")
            if clean_sentence:
                chunks.append({
                    "doc_id": doc_id,
                    "filename": filename,
                    "page": page_number,
                    "sentence": sent_number,
                    "text": clean_sentence,
                    "text_length": len(clean_sentence)
                })

    # OCR tuning knobs from app config: rasterization DPI and tesseract's
    # page segmentation mode (--psm).
    dpi = params["ocr"]["dpi"]
    psm = params["ocr"]["tesseract_psm"]

    if filename.endswith(".pdf"):
        try:
            with pdfplumber.open(io.BytesIO(content)) as pdf:
                print(f"🧾 PDF opened: (unknown)")
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        print(f"📄 Page {page_number} text preview: {repr(page_text[:100])}")
                        chunk_sentences(page_text, file_id, page_number, file.filename)
                    else:
                        # No embedded text layer — likely a scanned page.
                        print(f"⚠️ Page {page_number} has no text.")
                        if TESSERACT_AVAILABLE:
                            print("   ↪️ Falling back to OCR on this page")
                            # Rasterize just this page, then binarize for OCR.
                            img = page.to_image(resolution=dpi).original
                            img = preprocess_image(img)
                            try:
                                ocr_text = pytesseract.image_to_string(img, config=f'--psm {psm}')
                                print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                                chunk_sentences(ocr_text, file_id, page_number, file.filename)
                            except pytesseract.TesseractNotFoundError:
                                # Binary vanished between the startup probe and use.
                                print(f"❌ Tesseract not found at OCR time.")
                        else:
                            print("   ↪️ Skipping OCR (tesseract missing)")
        except Exception as e:
            # pdfplumber could not parse the file at all — OCR the whole PDF.
            print(f"❌ PDFPlumber error for (unknown): {e}")
            if TESSERACT_AVAILABLE:
                print("📸 OCR fallback for entire PDF...")
                images = convert_from_bytes(content, dpi=dpi)
                for page_number, img in enumerate(images, start=1):
                    img = preprocess_image(img)
                    try:
                        ocr_text = pytesseract.image_to_string(img, config=f'--psm {psm}')
                        print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                        chunk_sentences(ocr_text, file_id, page_number, file.filename)
                    except pytesseract.TesseractNotFoundError:
                        print(f"❌ Tesseract not found at OCR time.")
            else:
                print("   ↪️ Skipping full-PDF OCR (tesseract missing)")

    elif filename.endswith((".png", ".jpg", ".jpeg")):
        print(f"🖼️ Image file detected: (unknown)")
        if TESSERACT_AVAILABLE:
            image = Image.open(io.BytesIO(content))
            image = preprocess_image(image)
            try:
                ocr_text = pytesseract.image_to_string(image, config=f'--psm {psm}')
                print(f"🖨️ OCR text preview: {repr(ocr_text[:100])}")
                # A standalone image is treated as page 1 of its own document.
                chunk_sentences(ocr_text, file_id, page_number=1, filename=file.filename)
            except pytesseract.TesseractNotFoundError:
                print(f"❌ Tesseract not found at OCR time.")
        else:
            print("   ↪️ Skipping OCR on image (tesseract missing)")

    print(f"✅ Extracted {len(chunks)} chunks from (unknown)")
    return chunks