Spaces:
Sleeping
Sleeping
| """ | |
| Uses OCR to extract the raw text from a given image, optimized for receipts. | |
| Author: Sarath Rajan S | |
| Date: 26-01-2026 | |
| """ | |
| import pytesseract | |
| from typing import Optional | |
| from PIL import Image, ImageOps, ImageFilter | |
def extract_raw_text(image_path: str) -> Optional[str]:
    """
    Extract the raw text from an image file using pytesseract.

    The image is converted to grayscale and sharpened before being handed to
    Tesseract, which is run with a configuration suited to receipts.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with leading/trailing whitespace removed, or None
        when no non-whitespace text was recognised or when any step fails
        (the error is printed, not raised).
    """
    try:
        # Open the image using PIL; the context manager closes the file handle.
        with Image.open(image_path) as source_image:
            # 1. Convert to grayscale to reduce noise
            grayscale_image = ImageOps.grayscale(source_image)

            # 2. Sharpen for better character definition
            sharpened_image = grayscale_image.filter(ImageFilter.SHARPEN)

            # 3. Use custom Tesseract config
            # --psm 4: Assume a single column of text of variable sizes (common in receipts)
            # --oem 3: Default, based on what is available
            custom_config = r'--oem 3 --psm 4'
            text = pytesseract.image_to_string(sharpened_image, config=custom_config)

            if not text:
                return None

            # Strip BEFORE deciding what to return: whitespace-only OCR output
            # carries no text, so report it as None rather than as ''.
            stripped_text = str(text).strip()
            return stripped_text or None
    except Exception as e:
        # Best-effort boundary: report the failure and signal "no text".
        print(f"Error extracting text from {image_path}: {e}")
        return None
def extract_text_from_receipt(image_path: str) -> Optional[str]:
    """
    Run OCR on a receipt image and return its text without blank lines.

    Delegates to extract_raw_text, then trims every line and discards the
    ones that are empty after trimming.

    Args:
        image_path: Path of the receipt image to read.

    Returns:
        The non-empty, trimmed lines joined with newlines, or None when
        extract_raw_text produced no text.
    """
    raw_text = extract_raw_text(image_path)
    if not raw_text:
        return None

    # Basic cleanup: trim each line, then keep only those with content left.
    trimmed_lines = (raw_line.strip() for raw_line in raw_text.split('\n'))
    return '\n'.join(kept for kept in trimmed_lines if kept)
if __name__ == "__main__":
    import sys

    # Manual smoke test. The receipt path may be given as the first
    # command-line argument; with no argument the original development
    # sample path is used, so existing invocations behave as before.
    default_receipt_path = "/Users/sarathrajan/Desktop/catapultSplit/utils/architecture/receipts/receipt-1.jpeg"
    receipt_path = sys.argv[1] if len(sys.argv) > 1 else default_receipt_path
    print(extract_text_from_receipt(receipt_path))