Spaces:

Sarathrsk03
/

Catapult-Splitter

Sleeping

App Files Files Community

Catapult-Splitter / tools /ocr.py

Sarathrsk03

V 0.1

d904dd8 3 months ago

raw

history blame contribute delete

1.87 kB

	"""
	Uses OCR to extract the raw text from a given image, optimized for receipts.

	Author: Sarath Rajan S
	Date: 26-01-2026
	"""

	import pytesseract
	from typing import Optional
	from PIL import Image, ImageOps, ImageFilter

	def extract_raw_text(image_path: str) -> Optional[str]:
	    """
	    Run Tesseract OCR on an image file and return the recognised text.

	    The image is converted to grayscale and sharpened before recognition,
	    a light preprocessing pass intended to improve accuracy on receipts.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        The recognised text with leading/trailing whitespace removed, or
	        None when no text was recognised (including whitespace-only
	        output) or when loading the image / running OCR failed.
	    """
	    try:
	        # The context manager closes the underlying file once OCR is done.
	        with Image.open(image_path) as source_img:
	            # 1. Convert to grayscale to reduce noise
	            prepared = ImageOps.grayscale(source_img)

	            # 2. Sharpen for better character definition
	            prepared = prepared.filter(ImageFilter.SHARPEN)

	            # 3. Use custom Tesseract config
	            # --psm 4: Assume a single column of text of variable sizes (common in receipts)
	            # --oem 3: Default, based on what is available
	            custom_config = r'--oem 3 --psm 4'

	            text = pytesseract.image_to_string(prepared, config=custom_config)

	        # Strip *before* deciding between text and None: OCR output that is
	        # only whitespace must be reported as "nothing found" (None), not
	        # as an empty string.
	        cleaned = str(text).strip() if text else ""
	        return cleaned or None

	    except Exception as e:
	        # Deliberate best-effort boundary: any failure (missing file,
	        # unreadable image, OCR engine error) is reported and turned into
	        # None so callers only have to handle one "no result" value.
	        print(f"Error extracting text from {image_path}: {e}")
	        return None

	def extract_text_from_receipt(image_path: str) -> Optional[str]:
	    """
	    Extract receipt text via extract_raw_text and tidy it up.

	    Each line of the OCR output is trimmed of surrounding whitespace and
	    lines that end up empty are dropped, so the result has no blank lines.

	    Args:
	        image_path: Path of the receipt image to read.

	    Returns:
	        The cleaned text with lines joined by a newline, or None when
	        extract_raw_text yields no text.
	    """
	    raw = extract_raw_text(image_path)
	    if not raw:
	        return None

	    # Keep only lines that still have content after trimming.
	    kept = []
	    for piece in raw.split('\n'):
	        trimmed = piece.strip()
	        if trimmed:
	            kept.append(trimmed)
	    return '\n'.join(kept)


	if __name__ == "__main__":
	    # Manual smoke test. Usage: python ocr.py [path/to/receipt-image]
	    # Imported here because it is only needed when run as a script.
	    import sys

	    # Original developer-machine sample, kept as the fallback so running
	    # the script with no argument behaves as before.
	    default_receipt = "/Users/sarathrajan/Desktop/catapultSplit/utils/architecture/receipts/receipt-1.jpeg"
	    receipt_path = sys.argv[1] if len(sys.argv) > 1 else default_receipt
	    print(extract_text_from_receipt(receipt_path))