"""wasserstoff/app/services/file_handler.py

File ingestion service: extracts sentence-level text chunks (with metadata)
from uploaded PDF and image files, using pdfplumber for embedded text and
Tesseract OCR as a fallback.

NOTE(review): the original header lines here were web-page scrape residue
(author "adityasarade", commit 3ee1c38 "Downloading nltk during runtime"),
preserved in this docstring for provenance.
"""
from PIL import Image
import pytesseract
import pdfplumber
import io
from fastapi import UploadFile
from pdf2image import convert_from_bytes
import uuid
import nltk
from nltk.tokenize import sent_tokenize
from app.config import params
import shutil
import os
# Ensure the NLTK "punkt" sentence tokenizer is present at import time,
# since sent_tokenize() below depends on it.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    if os.environ.get("ENV") != "production":  # safe for local dev
        nltk.download("punkt")
    else:
        # Bug guard: previously this branch did nothing, so a production
        # deployment without punkt would crash later inside sent_tokenize()
        # with an unexplained LookupError. Surface the problem up front.
        print("⚠️ Warning: NLTK 'punkt' tokenizer missing and runtime download "
              "is disabled in production — sentence tokenization will fail.")

# OCR is optional: probe for the tesseract binary once at import time so the
# extraction code can skip OCR gracefully instead of raising.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
if not TESSERACT_AVAILABLE:
    print("⚠️ Warning: Tesseract not found in PATH — OCR will be skipped.")
def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Preprocess a PIL image for better OCR accuracy.

    Converts the image to grayscale, then applies a binary threshold:
    pixels darker than the configured cutoff become black, the rest
    white (mode '1' = 1-bit bilevel image).

    Args:
        image: Source PIL image (any mode).

    Returns:
        A 1-bit black-and-white PIL image.
    """
    gray = image.convert("L")  # grayscale
    thresh = params["ocr"]["threshold"]
    # Bug fix: the configured threshold was loaded into `thresh` but a
    # hard-coded 140 was used in the lambda, making the config value dead.
    bw = gray.point(lambda x: 0 if x < thresh else 255, '1')
    return bw
async def extract_chunks_from_file(file: UploadFile) -> list[dict]:
    """
    Extract sentence-level chunks with metadata from a PDF or image upload.

    PDFs are parsed page-by-page with pdfplumber; pages with no embedded
    text fall back to per-page OCR when tesseract is available. If
    pdfplumber fails entirely, the whole PDF is rasterized and OCR'd.
    PNG/JPEG uploads are OCR'd directly.

    Args:
        file: Uploaded file (.pdf, .png, .jpg, or .jpeg).

    Returns:
        List of dicts with keys:
        'doc_id', 'filename', 'page', 'sentence', 'text', 'text_length'.
        Empty for unsupported file types or when no text could be extracted.
    """
    content = await file.read()
    # Robustness: UploadFile.filename can be None; avoid AttributeError.
    filename = (file.filename or "").lower()
    file_id = f"DOC{str(uuid.uuid4())[:5].upper()}"  # short custom doc ID
    chunks: list[dict] = []

    def chunk_sentences(text, doc_id, page_number, filename):
        """Tokenize text into sentences and append one chunk dict each."""
        for sent_number, sentence in enumerate(sent_tokenize(text), start=1):
            clean_sentence = sentence.strip().replace("\n", " ")
            if clean_sentence:
                chunks.append({
                    "doc_id": doc_id,
                    "filename": filename,
                    "page": page_number,
                    "sentence": sent_number,
                    "text": clean_sentence,
                    "text_length": len(clean_sentence),
                })

    dpi = params["ocr"]["dpi"]
    psm = params["ocr"]["tesseract_psm"]

    if filename.endswith(".pdf"):
        try:
            with pdfplumber.open(io.BytesIO(content)) as pdf:
                # Bug fix: these log lines were f-strings with no placeholder
                # (printed a literal token); interpolate the real filename.
                print(f"🧾 PDF opened: {file.filename}")
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        print(f"📄 Page {page_number} text preview: {repr(page_text[:100])}")
                        chunk_sentences(page_text, file_id, page_number, file.filename)
                    else:
                        print(f"⚠️ Page {page_number} has no text.")
                        if TESSERACT_AVAILABLE:
                            print(" ↪️ Falling back to OCR on this page")
                            img = preprocess_image(page.to_image(resolution=dpi).original)
                            try:
                                ocr_text = pytesseract.image_to_string(img, config=f'--psm {psm}')
                                print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                                chunk_sentences(ocr_text, file_id, page_number, file.filename)
                            except pytesseract.TesseractNotFoundError:
                                print("❌ Tesseract not found at OCR time.")
                        else:
                            print(" ↪️ Skipping OCR (tesseract missing)")
        except Exception as e:
            # pdfplumber failed outright (corrupt/scanned PDF) — rasterize the
            # whole document and OCR each page instead.
            print(f"❌ PDFPlumber error for {file.filename}: {e}")
            if TESSERACT_AVAILABLE:
                print("📸 OCR fallback for entire PDF...")
                images = convert_from_bytes(content, dpi=dpi)
                for page_number, img in enumerate(images, start=1):
                    img = preprocess_image(img)
                    try:
                        ocr_text = pytesseract.image_to_string(img, config=f'--psm {psm}')
                        print(f"🖼️ OCR text from page {page_number}: {repr(ocr_text[:100])}")
                        chunk_sentences(ocr_text, file_id, page_number, file.filename)
                    except pytesseract.TesseractNotFoundError:
                        print("❌ Tesseract not found at OCR time.")
            else:
                print(" ↪️ Skipping full-PDF OCR (tesseract missing)")
    elif filename.endswith((".png", ".jpg", ".jpeg")):
        print(f"🖼️ Image file detected: {file.filename}")
        if TESSERACT_AVAILABLE:
            image = preprocess_image(Image.open(io.BytesIO(content)))
            try:
                ocr_text = pytesseract.image_to_string(image, config=f'--psm {psm}')
                print(f"🖨️ OCR text preview: {repr(ocr_text[:100])}")
                chunk_sentences(ocr_text, file_id, page_number=1, filename=file.filename)
            except pytesseract.TesseractNotFoundError:
                print("❌ Tesseract not found at OCR time.")
        else:
            print(" ↪️ Skipping OCR on image (tesseract missing)")
    else:
        # Robustness: previously unsupported extensions fell through silently
        # and returned []; make that explicit in the logs.
        print(f"⚠️ Unsupported file type: {file.filename} — no chunks extracted.")

    print(f"✅ Extracted {len(chunks)} chunks from {file.filename}")
    return chunks