# File: skincare/app/utils/ocr.py
# Last commit: "Update app/utils/ocr.py" by Maulidaaa (8788edb, verified)
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
from io import BytesIO
import re
import logging
import easyocr
# Initialize the EasyOCR reader once at import time (English only, GPU disabled).
ocr_reader = easyocr.Reader(['en'], gpu=False)
# Image preprocessing step applied before OCR
def preprocess_image(image):
    """Return a cleaned-up copy of ``image`` prepared for text recognition.

    The image is converted to grayscale (mode ``"L"``), its contrast is
    enhanced by a factor of 2, and a size-3 median filter is applied to
    suppress noise.

    Args:
        image: A PIL image.

    Returns:
        The processed PIL image.
    """
    grayscale = image.convert("L")
    high_contrast = ImageEnhance.Contrast(grayscale).enhance(2)
    return high_contrast.filter(ImageFilter.MedianFilter(3))
# Matches an ingredient-list label such as 'Ingredients:' or 'Komposisi:'
# (case-insensitive), plus an optional ':' / '-' separator and the
# whitespace around it.
_LABEL_PATTERN = re.compile(
    r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\s*[:\-]?\s*',
    flags=re.IGNORECASE,
)
# Matches any character that is not an ASCII letter, digit, comma, period,
# whitespace, or dash.
_IRRELEVANT_CHARS_PATTERN = re.compile(r'[^A-Za-z0-9,\.\s\-]')


def _clean_ocr_text(text):
    """Remove ingredient-list labels and irrelevant characters from OCR text."""
    text = _LABEL_PATTERN.sub('', text)
    text = _IRRELEVANT_CHARS_PATTERN.sub('', text)
    return text.strip()


# Main function: extract text from an image
def extract_text_from_image(image_file):
    """Run OCR on an uploaded image and return the cleaned-up text.

    The image bytes are read from ``image_file``, decoded, preprocessed with
    ``preprocess_image`` and passed to the module-level ``ocr_reader``. The
    recognized fragments are joined with single spaces, logged, and then
    stripped of list labels and irrelevant characters.

    Args:
        image_file: An object with a ``read()`` method returning the encoded
            image bytes.

    Returns:
        The cleaned OCR text, or ``""`` if any step fails. This function
        never raises; failures are logged together with their traceback.
    """
    try:
        # Open and preprocess the image.
        image = Image.open(BytesIO(image_file.read())).convert("RGB")
        image_np = np.array(preprocess_image(image))

        # OCR: the recognized string is taken from index 1 of each result.
        results = ocr_reader.readtext(image_np)
        text = " ".join(res[1] for res in results)
        logging.info("OCR Raw Text: %s", text)

        # Filtering out words that are not common ingredients is deliberately
        # not done here; it is better handled in the `extract_ingredients`
        # stage.
        return _clean_ocr_text(text)
    except Exception as e:
        # Best-effort contract: report the failure (with traceback, so the
        # root cause stays diagnosable) and fall back to an empty result.
        logging.exception("OCR error: %s", e)
        return ""