import cv2
import pytesseract
from pytesseract import Output
import numpy as np
import gradio as gr
from PIL import Image
from collections import defaultdict
import re

# Spaces/tabs sitting between two Thai characters. Lookarounds keep the
# neighbouring characters out of the match, so every gap in a run such as
# "ก ข ค" is removed (a consuming pattern would skip every second gap).
# Newlines are deliberately not matched so OCR line breaks survive.
_THAI_GAP_RE = re.compile(r'(?<=[\u0E00-\u0E7F])[ \t]+(?=[\u0E00-\u0E7F])')

# No character whitelist: an ASCII-only whitelist would discard the Thai
# script that `-l eng+tha` is loaded for, as well as punctuation such as the
# decimal point in amounts.
_TESSERACT_CONFIG = r'-l eng+tha --oem 3 --psm 6'


def _confidence(raw):
    """Return a Tesseract confidence cell as a float, or -1.0 if unparsable."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return -1.0


def ocr_scb_slip_text(image):
    """Run English/Thai OCR on a slip image and return the recognised text.

    The image is binarised with an adaptive threshold, lightly cleaned with a
    2x2 dilate/erode pass, and handed to Tesseract. Recognised words are
    grouped by Tesseract's (block, paragraph, line) numbering and joined back
    into text.

    Args:
        image: The slip image. It is converted with ``np.array`` and treated
            as RGB (it goes through ``cv2.COLOR_RGB2BGR``). ``None`` is
            accepted so that a submission without an image yields an empty
            result rather than an exception.

    Returns:
        The recognised text with one OCR line per output line. Words on a
        line are separated by a single space, except that spaces between two
        Thai characters are removed. Returns ``""`` when ``image`` is ``None``.
    """
    if image is None:
        return ""

    # PIL (RGB) -> OpenCV (BGR)
    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # Preprocessing to make the OCR more accurate
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )
    kernel = np.ones((2, 2), np.uint8)
    gray = cv2.dilate(gray, kernel, iterations=1)
    gray = cv2.erode(gray, kernel, iterations=1)

    # OCR
    data = pytesseract.image_to_data(
        gray, config=_TESSERACT_CONFIG, output_type=Output.DICT
    )

    # Group the recognised words into lines
    lines = defaultdict(list)
    rows = zip(
        data['conf'], data['text'],
        data['block_num'], data['par_num'], data['line_num'],
    )
    for conf, raw_text, block_num, par_num, line_num in rows:
        word = raw_text.strip()
        # The confidence column may arrive as an int, a float or a decimal
        # string depending on the pytesseract/Tesseract version, so it is
        # parsed as a float; rows with confidence <= 0 are skipped.
        if word and _confidence(conf) > 0:
            lines[(block_num, par_num, line_num)].append(word)

    # Join the words of each line with spaces and the lines with newlines
    extracted_texts = [" ".join(words) for _, words in sorted(lines.items())]
    final_text = "\n".join(extracted_texts)

    # Remove the spaces inserted between adjacent Thai tokens
    return _THAI_GAP_RE.sub('', final_text)

# Gradio front end: one image upload in, one text box out.
_slip_upload = gr.Image(type="pil", label="อัปโหลดสลิป SCB")
_ocr_result = gr.Textbox(label="ข้อความ OCR รวมทุกบรรทัด")

demo = gr.Interface(
    title="OCR สลิป SCB (ข้อความรวม)",
    description="OCR สลิป SCB รวมโลโก้ SCB และข้อความทั้งหมดเป็นข้อความเดียว",
    fn=ocr_scb_slip_text,
    inputs=_slip_upload,
    outputs=_ocr_result,
)

# Launch the interface only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()