# Sarathrsk03's picture
# V 0.1
# d904dd8
"""
Uses OCR to extract the raw text from a given image, optimized for receipts.
Author: Sarath Rajan S
Date: 26-01-2026
"""
import pytesseract
from typing import Optional
from PIL import Image, ImageOps, ImageFilter
def extract_raw_text(image_path: str) -> Optional[str]:
    """
    Uses pytesseract to extract the raw text from the image.
    Includes basic preprocessing to improve OCR accuracy for receipts.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The OCR output with leading/trailing whitespace removed, or None
        when the output is empty or whitespace-only, or when any step fails
        (the error is printed rather than raised).
    """
    try:
        # Open the image using PIL
        with Image.open(image_path) as img:
            # 1. Convert to grayscale to reduce noise
            img = ImageOps.grayscale(img)
            # 2. Sharpen for better character definition
            img = img.filter(ImageFilter.SHARPEN)
            # 3. Use custom Tesseract config
            # --psm 4: Assume a single column of text of variable sizes (common in receipts)
            # --oem 3: Default, based on what is available
            custom_config = r'--oem 3 --psm 4'
            text = pytesseract.image_to_string(img, config=custom_config)
        # Strip BEFORE the emptiness check: output made only of whitespace
        # (newlines, form feed) is truthy as-is and would otherwise come
        # back as "" instead of None.
        cleaned = str(text).strip() if text else ""
        return cleaned or None
    except Exception as e:
        # Deliberately broad: this is a best-effort helper, so any failure
        # (unreadable file, OCR engine error, ...) is reported and mapped to None.
        print(f"Error extracting text from {image_path}: {e}")
        return None
def extract_text_from_receipt(image_path: str) -> Optional[str]:
    """
    Receipt-oriented wrapper around extract_raw_text.

    Runs extract_raw_text on image_path, then trims every line and drops
    the lines that are blank, joining what remains with newlines.
    Returns None when extract_raw_text yields nothing (None or an empty
    string).
    """
    raw = extract_raw_text(image_path)
    if not raw:
        return None

    # Basic cleanup: keep only lines that still have content once trimmed
    kept = []
    for piece in raw.split('\n'):
        trimmed = piece.strip()
        if trimmed:
            kept.append(trimmed)
    return '\n'.join(kept)
if __name__ == "__main__":
    # Manual smoke test: OCR the receipt named by the first command-line
    # argument; with no argument, fall back to the original sample path
    # (which only exists on the author's machine).
    import sys

    default_receipt = "/Users/sarathrajan/Desktop/catapultSplit/utils/architecture/receipts/receipt-1.jpeg"
    receipt_path = sys.argv[1] if len(sys.argv) > 1 else default_receipt
    print(extract_text_from_receipt(receipt_path))