Spaces:

adityasarade
/

wasserstoff

Sleeping

App Files Files Community

wasserstoff / app /services /file_handler.py

adityasarade

Downloading nltk during runtime

3ee1c38 9 months ago

raw

history blame contribute delete

5.25 kB

	from PIL import Image
	import pytesseract
	import pdfplumber
	import io
	from fastapi import UploadFile
	from pdf2image import convert_from_bytes
	import uuid
	import nltk
	from nltk.tokenize import sent_tokenize
	from app.config import params
	import shutil
	import os

	# Sentence chunking relies on NLTK's "punkt" tokenizer models. Look for them
	# first; when they are missing, fetch them at import time unless the ENV
	# environment variable marks this process as a production deployment.
	try:
	    nltk.data.find("tokenizers/punkt")
	except LookupError:
	    if os.getenv("ENV") == "production":
	        pass  # never download at runtime in production
	    else:
	        nltk.download("punkt")  # safe for local dev

	# Probe PATH once at import time so callers can skip OCR up front when the
	# tesseract binary is not installed.
	TESSERACT_AVAILABLE = shutil.which("tesseract") is not None
	if TESSERACT_AVAILABLE is False:
	    print("⚠️ Warning: Tesseract not found in PATH — OCR will be skipped.")

	def preprocess_image(image: Image.Image) -> Image.Image:
	    """
	    Preprocess a PIL image for better OCR accuracy.

	    The image is converted to 8-bit grayscale and then binarised: pixels
	    darker than the configured threshold become black (0), all others
	    become white (255).

	    Args:
	        image: The source image; it is not modified.

	    Returns:
	        A new 1-bit (mode "1") black-and-white image.
	    """
	    # Use the configured cut-off. The previous code read this value but then
	    # compared against a hard-coded 140, so the setting had no effect.
	    threshold = params["ocr"]["threshold"]
	    grayscale = image.convert("L")
	    return grayscale.point(lambda level: 0 if level < threshold else 255, "1")

	async def extract_chunks_from_file(file: UploadFile) -> list[dict]:
	    """
	    Extract sentence-level chunks with metadata from an uploaded PDF or image.

	    PDFs are read with pdfplumber. A page without extractable text is OCR'd
	    with Tesseract when the binary is available. If pdfplumber raises, the
	    pages that were not processed yet are rasterised with pdf2image and
	    OCR'd instead. Images (.png/.jpg/.jpeg) are OCR'd directly and reported
	    as page 1. A file with any other extension, or without a filename,
	    yields an empty list.

	    Args:
	        file: The uploaded file. Its whole content is read into memory.

	    Returns:
	        One dict per non-empty sentence with the keys 'doc_id', 'filename',
	        'page', 'sentence', 'text' and 'text_length'.
	    """
	    content = await file.read()
	    # The upload may carry no filename; treat that as an unsupported type
	    # instead of crashing on None.lower().
	    original_name = file.filename or ""
	    filename = original_name.lower()
	    file_id = f"DOC{str(uuid.uuid4())[:5].upper()}"  # Short custom doc ID
	    chunks = []

	    dpi = params["ocr"]["dpi"]
	    psm = params["ocr"]["tesseract_psm"]

	    def chunk_sentences(text, page_number):
	        """Split text into sentences and append one chunk per non-empty one."""
	        for sent_number, sentence in enumerate(sent_tokenize(text), start=1):
	            clean_sentence = sentence.strip().replace("\n", " ")
	            if clean_sentence:
	                chunks.append({
	                    "doc_id": file_id,
	                    "filename": original_name,
	                    "page": page_number,
	                    "sentence": sent_number,
	                    "text": clean_sentence,
	                    "text_length": len(clean_sentence),
	                })

	    def ocr_and_chunk(image, page_number, preview_label):
	        """Preprocess and OCR one image, then chunk the recognised text."""
	        prepared = preprocess_image(image)
	        try:
	            ocr_text = pytesseract.image_to_string(prepared, config=f"--psm {psm}")
	        except pytesseract.TesseractNotFoundError:
	            # The binary vanished (or PATH changed) after the import-time probe.
	            print("❌ Tesseract not found at OCR time.")
	            return
	        print(f"{preview_label}: {repr(ocr_text[:100])}")
	        chunk_sentences(ocr_text, page_number)

	    if filename.endswith(".pdf"):
	        # Pages fully handled via pdfplumber; the fallback must not redo them,
	        # otherwise their sentences would be emitted twice.
	        finished_pages = set()
	        try:
	            with pdfplumber.open(io.BytesIO(content)) as pdf:
	                print(f"🧾 PDF opened: {filename}")
	                for page_number, page in enumerate(pdf.pages, start=1):
	                    page_text = page.extract_text()
	                    if page_text:
	                        print(f"📄 Page {page_number} text preview: {repr(page_text[:100])}")
	                        chunk_sentences(page_text, page_number)
	                    else:
	                        print(f"⚠️ Page {page_number} has no text.")
	                        if TESSERACT_AVAILABLE:
	                            print(" ↪️ Falling back to OCR on this page")
	                            ocr_and_chunk(
	                                page.to_image(resolution=dpi).original,
	                                page_number,
	                                f"🖼️ OCR text from page {page_number}",
	                            )
	                        else:
	                            print(" ↪️ Skipping OCR (tesseract missing)")
	                    finished_pages.add(page_number)
	        except Exception as e:
	            # Deliberately broad: any failure while reading the PDF switches
	            # to the rasterise-and-OCR path for the remaining pages.
	            print(f"❌ PDFPlumber error for {filename}: {e}")
	            if TESSERACT_AVAILABLE:
	                print("📸 OCR fallback for entire PDF...")
	                images = convert_from_bytes(content, dpi=dpi)
	                for page_number, img in enumerate(images, start=1):
	                    if page_number in finished_pages:
	                        continue
	                    ocr_and_chunk(
	                        img,
	                        page_number,
	                        f"🖼️ OCR text from page {page_number}",
	                    )
	            else:
	                print(" ↪️ Skipping full-PDF OCR (tesseract missing)")

	    elif filename.endswith((".png", ".jpg", ".jpeg")):
	        print(f"🖼️ Image file detected: {filename}")
	        if TESSERACT_AVAILABLE:
	            image = Image.open(io.BytesIO(content))
	            ocr_and_chunk(image, 1, "🖨️ OCR text preview")
	        else:
	            print(" ↪️ Skipping OCR on image (tesseract missing)")

	    print(f"✅ Extracted {len(chunks)} chunks from {filename}")
	    return chunks