# Spaces: Sleeping  (page-status residue from the hosting site; not part of the program)
| import pytesseract | |
| from PIL import Image | |
| import cv2 | |
| import numpy as np | |
| import re | |
| import os | |
| import glob | |
| import io | |
class OcrPriceExtractor:
    """Run Tesseract OCR on an image and pull out "Rp"-prefixed price strings.

    An image may be supplied as a file path, raw encoded bytes, or an
    ``io.BytesIO``. Before OCR a horizontal band is cropped from the top and
    the bottom of the image (see ``CROP_TOP_RATIO`` / ``CROP_BOTTOM_RATIO``).

    All ``extract_text*`` methods are best-effort: on any failure they print
    a message and return ``(None, None)`` instead of raising.
    """

    # Fraction of the image height discarded from the top / bottom before OCR.
    CROP_TOP_RATIO = 0.2
    CROP_BOTTOM_RATIO = 0.2

    # "Rp", optional whitespace, a run of digits, then any number of
    # "."- or ","-separated digit groups (e.g. "Rp 12.500", "Rp1,000,000").
    _PRICE_PATTERN = re.compile(r'Rp\s*\d+(?:[.,]\d+)*')

    def __init__(self, lang='eng'):
        """Store the OCR settings.

        Args:
            lang: Tesseract language code passed to ``image_to_string``.
        """
        self.lang = lang
        # Per the Tesseract CLI docs: --oem 3 selects the default engine mode,
        # --psm 6 assumes a single uniform block of text.
        self.custom_config = r'--oem 3 --psm 6'

    def extract_price(self, text):
        """Return every "Rp…" price substring found in *text*, in order.

        Args:
            text: String to search.

        Returns:
            A list of matched substrings; empty when nothing matches.
        """
        return self._PRICE_PATTERN.findall(text)

    def crop_image(self, image):
        """Drop the top and bottom bands of *image* and return the middle.

        Args:
            image: Array-like whose first axis is the image height
                (anything exposing ``.shape`` and 2-D slicing).

        Returns:
            A slice of *image* with ``CROP_TOP_RATIO`` of the rows removed
            from the top and ``CROP_BOTTOM_RATIO`` from the bottom.
        """
        height = image.shape[0]
        crop_top = int(height * self.CROP_TOP_RATIO)
        crop_bottom = int(height * self.CROP_BOTTOM_RATIO)
        return image[crop_top:height - crop_bottom, :]

    @staticmethod
    def _bgr_to_rgb(image):
        """Return *image* with its channel axis reversed (BGR -> RGB).

        OpenCV decodes colour images in BGR order while ``Image.fromarray``
        interprets a 3-channel array as RGB; without this swap the red and
        blue channels are exchanged before Tesseract sees the picture.
        Arrays that are not 3-channel are returned unchanged.
        """
        if image.ndim == 3 and image.shape[2] == 3:
            # Reversing the last axis yields a negatively-strided view; make
            # it contiguous so downstream consumers get a plain buffer.
            return np.ascontiguousarray(image[:, :, ::-1])
        return image

    def _run_ocr(self, image):
        """Crop a decoded BGR image, OCR it and extract prices.

        Shared by the path-based and bytes-based entry points so both run the
        exact same pipeline.

        Returns:
            ``(text, prices)`` where *text* is the stripped OCR output and
            *prices* the list produced by ``extract_price``.
        """
        cropped = self.crop_image(image)
        pil_image = Image.fromarray(self._bgr_to_rgb(cropped))
        text = pytesseract.image_to_string(
            pil_image, config=self.custom_config, lang=self.lang
        ).strip()
        return text, self.extract_price(text)

    def extract_text_from_bytes(self, image_bytes):
        """OCR an encoded image (JPEG/PNG/… file contents) held in memory.

        Args:
            image_bytes: Bytes-like object containing the encoded image.

        Returns:
            ``(text, prices)`` on success, ``(None, None)`` on any error
            (the error is printed).
        """
        try:
            # An empty buffer can never decode; fail with the same message as
            # an undecodable one instead of an OpenCV assertion dump.
            if len(image_bytes) == 0:
                raise ValueError("Could not decode image bytes")
            buffer = np.frombuffer(image_bytes, np.uint8)
            image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
            if image is None:
                raise ValueError("Could not decode image bytes")
            return self._run_ocr(image)
        except Exception as e:
            print(f"Error during OCR: {str(e)}")
            return None, None

    def extract_text_from_bytesio(self, bytes_io):
        """OCR an encoded image held in an ``io.BytesIO``.

        The whole buffer is read with ``getvalue()``, so the stream position
        is irrelevant and is left untouched.

        Returns:
            ``(text, prices)`` on success, ``(None, None)`` on any error
            (the error is printed).
        """
        try:
            return self.extract_text_from_bytes(bytes_io.getvalue())
        except Exception as e:
            print(f"Error during OCR from BytesIO: {str(e)}")
            return None, None

    def extract_text(self, image_input):
        """OCR an image given as a path, bytes-like object or ``io.BytesIO``.

        Args:
            image_input: ``io.BytesIO``, ``bytes``/``bytearray``/``memoryview``
                with encoded image data, or a filesystem path (``str`` or
                ``os.PathLike``).

        Returns:
            ``(text, prices)`` on success, ``(None, None)`` on any error
            (the error is printed).
        """
        try:
            if isinstance(image_input, io.BytesIO):
                return self.extract_text_from_bytesio(image_input)
            if isinstance(image_input, (bytes, bytearray, memoryview)):
                return self.extract_text_from_bytes(image_input)

            # Anything else is treated as a path; fspath() rejects
            # non-path objects with a TypeError that is reported below.
            image = cv2.imread(os.fspath(image_input))
            if image is None:
                raise ValueError("Could not read image")
            return self._run_ocr(image)
        except Exception as e:
            print(f"Error during OCR: {str(e)}")
            return None, None

    def test_all_images(self, directory="."):
        """OCR every image in *directory* and print the results.

        Each file is processed twice - once by path and once through an
        ``io.BytesIO`` - so both input routes are exercised.

        Args:
            directory: Folder searched (non-recursively) for
                ``*.jpg``, ``*.jpeg``, ``*.png`` and ``*.bmp`` files.
        """
        image_extensions = ('*.jpg', '*.jpeg', '*.png', '*.bmp')
        image_files = []
        for ext in image_extensions:
            image_files.extend(glob.glob(os.path.join(directory, ext)))

        if not image_files:
            print("No image files found in directory!\n")
            return

        # glob order is filesystem-dependent; sort for reproducible output.
        for image_path in sorted(image_files):
            print(f"\nProcessing: {image_path}")
            print("-" * 50)

            text, prices = self.extract_text(image_path)
            self._print_results(text, prices)

            with open(image_path, 'rb') as f:
                bytes_io = io.BytesIO(f.read())
            text, prices = self.extract_text(bytes_io)
            self._print_results(text, prices)

            print("-" * 50)

    def _print_results(self, text, prices):
        """Print the prices found, or a note when there are none / no text."""
        if not text:
            print("No text was extracted or an error occurred.")
            return
        if not prices:
            print("\nNo prices found in this image.")
            return
        print("\nTest Passed. Found Prices:")
        for price in prices:
            print(price)
if __name__ == "__main__":
    import sys

    # Script entry point: "--all" as the first argument scans the current
    # directory; otherwise a single sample image is read into memory and
    # passed through the BytesIO code path.
    extractor = OcrPriceExtractor()
    run_all = sys.argv[1:2] == ["--all"]

    if run_all:
        extractor.test_all_images()
    else:
        sample_path = "3.jpg"
        with open(sample_path, 'rb') as handle:
            stream = io.BytesIO(handle.read())
        ocr_text, found = extractor.extract_text(stream)

        if not ocr_text:
            # Covers both the error case (None) and an empty OCR result.
            print("No text was extracted or an error occurred.")
        else:
            print("\nFound Prices:")
            for item in found:
                print(item)