Update app/utils/ocr.py
Browse files- app/utils/ocr.py +22 -26
app/utils/ocr.py
CHANGED
|
@@ -5,44 +5,40 @@ import re
|
|
| 5 |
import logging
|
| 6 |
import easyocr
|
| 7 |
|
|
|
|
| 8 |
# Shared EasyOCR reader for English text, created at module level with GPU disabled.
ocr_reader = easyocr.Reader(['en'], gpu=False)
|
| 9 |
|
| 10 |
-
# Fungsi
|
| 11 |
def preprocess_image(image):
|
| 12 |
-
|
| 13 |
-
image = image.
|
| 14 |
-
|
| 15 |
-
# Meningkatkan kontras
|
| 16 |
-
enhancer = ImageEnhance.Contrast(image)
|
| 17 |
-
image = enhancer.enhance(2)
|
| 18 |
-
|
| 19 |
-
# Menggunakan filter untuk mengurangi noise
|
| 20 |
-
image = image.filter(ImageFilter.MedianFilter(3))
|
| 21 |
-
|
| 22 |
return image
|
| 23 |
|
|
|
|
| 24 |
def extract_text_from_image(image_file):
|
| 25 |
try:
|
| 26 |
-
#
|
| 27 |
image = Image.open(BytesIO(image_file.read())).convert("RGB")
|
| 28 |
image = preprocess_image(image)
|
| 29 |
-
|
| 30 |
image_np = np.array(image)
|
| 31 |
-
|
| 32 |
-
#
|
| 33 |
results = ocr_reader.readtext(image_np)
|
| 34 |
text = " ".join([res[1] for res in results])
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
text = re.sub(r'
|
| 39 |
-
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as e:
|
| 46 |
logging.error(f"OCR error: {e}")
|
| 47 |
return ""
|
| 48 |
-
|
|
|
|
| 5 |
import logging
|
| 6 |
import easyocr
|
| 7 |
|
| 8 |
+
# Inisialisasi reader
|
| 9 |
# Shared EasyOCR reader for English text, created at module level with GPU disabled.
ocr_reader = easyocr.Reader(['en'], gpu=False)
|
| 10 |
|
| 11 |
+
# Image preprocessing applied before OCR
def preprocess_image(image):
    """Return a grayscale, contrast-boosted, median-filtered copy of ``image``.

    Args:
        image: An image object supporting ``convert`` and ``filter``
            (a Pillow image in the visible caller).

    Returns:
        The image after conversion to mode "L", a contrast enhancement with
        factor 2, and a median filter of size 3.
    """
    grayscale = image.convert("L")

    contrast_enhancer = ImageEnhance.Contrast(grayscale)
    high_contrast = contrast_enhancer.enhance(2)

    # Median filtering is used here to reduce noise.
    noise_filter = ImageFilter.MedianFilter(3)
    return high_contrast.filter(noise_filter)
|
| 17 |
|
| 18 |
+
# Main entry point: extract cleaned text from an image
def extract_text_from_image(image_file):
    """Run OCR on an image and return the cleaned, recognized text.

    The bytes read from ``image_file`` are opened as an image, passed
    through ``preprocess_image``, converted to a NumPy array and handed to
    the module-level ``ocr_reader``. The recognized fragments are joined
    with single spaces, then cleaned:

    * label words such as "Ingredients", "Komposisi", "Composition",
      "Bahan" and "Daftar Bahan" (case-insensitive, with an optional
      trailing ``:`` or ``-``) are removed wherever they occur;
    * every character other than ASCII letters, digits, commas, periods,
      whitespace and dashes is dropped.

    Args:
        image_file: A binary file-like object exposing ``read()``.

    Returns:
        The cleaned text with surrounding whitespace stripped, or an empty
        string if any step raises (the error is logged with its traceback).
    """
    try:
        # Open and preprocess the image.
        image = Image.open(BytesIO(image_file.read())).convert("RGB")
        image = preprocess_image(image)
        image_np = np.array(image)

        # OCR: the second element of each result is joined as the text.
        results = ocr_reader.readtext(image_np)
        text = " ".join(res[1] for res in results)
        logging.info("OCR Raw Text: %s", text)

        # Remove heading labels such as 'Ingredients:', 'Komposisi:', etc.
        text = re.sub(r'\b(Ingredients|Komposisi|Composition|Bahan|Daftar Bahan)\s*[:\-]?\s*', '', text, flags=re.IGNORECASE)

        # Drop irrelevant characters (anything that is not alphanumeric,
        # a period, a comma, whitespace, or a dash).
        text = re.sub(r'[^A-Za-z0-9,\.\s\-]', '', text)

        # NOTE: filtering out words that are not common ingredients is
        # better done in the `extract_ingredients` stage, not here.

        return text.strip()

    except Exception as e:
        # Best-effort contract: callers get "" on failure. Use
        # logging.exception so the traceback is preserved in the log.
        logging.exception("OCR error: %s", e)
        return ""
|
|
|