|
|
import numpy as np |
|
|
from PIL import Image, ImageEnhance, ImageFilter |
|
|
from io import BytesIO |
|
|
import re |
|
|
import logging |
|
|
import easyocr |
|
|
|
|
|
|
|
|
# Module-wide EasyOCR reader, constructed once when this module is imported
# and reused by every OCR call below. Configured for English ('en') text
# and CPU execution (gpu=False).
ocr_reader = easyocr.Reader(
    ['en'],
    gpu=False,
)
|
|
|
|
|
|
|
|
def preprocess_image(image, *, contrast_factor=2, median_size=3):
    """Prepare an image for OCR: grayscale, boost contrast, then denoise.

    Args:
        image: A PIL image object.
        contrast_factor: Factor handed to ``ImageEnhance.Contrast.enhance``.
            Per Pillow, 1.0 returns the image unchanged and larger values
            increase contrast. Defaults to 2.
        median_size: Kernel size handed to ``ImageFilter.MedianFilter``.
            Defaults to 3.

    Returns:
        The processed image in single-channel "L" mode.
    """
    # Work on an 8-bit grayscale version of the input.
    grayscale = image.convert("L")
    # Increase contrast before filtering.
    contrasted = ImageEnhance.Contrast(grayscale).enhance(contrast_factor)
    # Median filtering replaces each pixel with the median of its neighbourhood.
    return contrasted.filter(ImageFilter.MedianFilter(median_size))
|
|
|
|
|
|
|
|
# Label words that may precede an ingredient list (English and Indonesian
# spellings), plus an optional ":" or "-" separator and surrounding whitespace.
# Matched case-insensitively.
_HEADER_LABEL_RE = re.compile(
    r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\s*[:\-]?\s*',
    flags=re.IGNORECASE,
)

# Every character that is NOT an ASCII letter, a digit, a comma, a period,
# whitespace, or a hyphen.
_DISALLOWED_CHARS_RE = re.compile(r'[^A-Za-z0-9,\.\s\-]')


def _clean_ocr_text(text):
    """Remove ingredient-list labels and disallowed characters, then trim."""
    without_labels = _HEADER_LABEL_RE.sub('', text)
    allowed_only = _DISALLOWED_CHARS_RE.sub('', without_labels)
    return allowed_only.strip()


def extract_text_from_image(image_file) -> str:
    """Run OCR on an uploaded image and return the cleaned-up text.

    The bytes of ``image_file`` are decoded with PIL, passed through
    ``preprocess_image``, and recognised with the module-level ``ocr_reader``.
    The recognised fragments are joined with single spaces, logged at INFO
    level, and then cleaned: label words such as "Ingredients:" are removed
    and only ASCII letters, digits, commas, periods, hyphens and whitespace
    are kept.

    Args:
        image_file: An object with a ``read()`` method returning the encoded
            image bytes.

    Returns:
        The cleaned text with leading and trailing whitespace stripped, or an
        empty string if any step fails. This function never raises for
        ordinary errors; failures are logged instead.
    """
    try:
        image = Image.open(BytesIO(image_file.read())).convert("RGB")
        image_np = np.array(preprocess_image(image))

        results = ocr_reader.readtext(image_np)
        # Index 1 of each result entry holds the recognised string.
        text = " ".join(res[1] for res in results)
        logging.info("OCR Raw Text: %s", text)

        return _clean_ocr_text(text)
    except Exception as e:
        # Deliberate best-effort boundary: callers receive "" rather than an
        # exception. logging.exception keeps the traceback so that failures
        # remain diagnosable.
        logging.exception("OCR error: %s", e)
        return ""
|
|
|