# Spaces: sompimnara/cvproject (status: Sleeping)
# File size: 8,454 Bytes -- revision a1916d2

import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import re
import numpy as np
import cv2  # ใช้แค่ตอนรับภาพเข้า (Interface) แต่ใน Process เราจะเขียนเอง

# === 1. Load the Donut (Transformer) model ===
print("⏳ กำลังโหลดโมเดล Donut Transformer...")
MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"

# Inference is pinned to the CPU; the original author chose this for
# stability on Mac.
device = "cpu"

try:
    processor = DonutProcessor.from_pretrained(MODEL_NAME)
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
    model.to(device)

    print(f"✅ โหลด Donut สำเร็จ! (Running on {device})")
except Exception as load_error:
    # Any failure leaves both globals as None; extract_text() checks
    # `model is None` and reports the problem instead of crashing.
    print(f"❌ โหลดโมเดลไม่สำเร็จ: {load_error}")
    model = processor = None


# Manual computer vision helpers (written by hand instead of calling OpenCV)


def manual_bgr_to_rgb(image):
    """
    [Manual CV 1] Convert BGR -> RGB with plain array indexing,
    instead of calling cv2.cvtColor(image, cv2.COLOR_BGR2RGB).

    A 2-D (grayscale) array is expanded to a new uint8 array with three
    identical channels. Any other array is copied and channels 0 and 2 of
    the copy are exchanged; the input is never modified and any further
    channels are left as they are.
    """
    if len(image.shape) != 2:
        # Colour image: read both planes from the untouched input so the
        # second assignment never sees an already-overwritten channel.
        swapped = image.copy()
        red_plane, blue_plane = image[:, :, 2], image[:, :, 0]
        swapped[:, :, 0] = red_plane
        swapped[:, :, 2] = blue_plane
        return swapped

    # Grayscale image: broadcast the single plane into all three channels.
    height, width = image.shape
    gray_rgb = np.zeros((height, width, 3), dtype=np.uint8)
    gray_rgb[...] = image[:, :, np.newaxis]
    return gray_rgb

def manual_contrast_stretch(image):
    """
    [Manual CV 2] Stretch contrast with min-max normalization,
    instead of calling cv2.normalize.

    Formula: new_pixel = (pixel - min) * 255 / (max - min), where min and
    max are taken over the whole array (all channels together).

    Args:
        image: NumPy array of pixel values.

    Returns:
        A uint8 array of the same shape whose values span 0..255. If the
        array is empty or every pixel has the same value, the input object
        itself is returned unchanged (original dtype).
    """
    # np.min / np.max raise on an empty array; there is nothing to stretch.
    if image.size == 0:
        return image

    # Work in float64 so the subtraction and scaling cannot overflow.
    img_float = image.astype(np.float64)

    # Darkest and brightest values in the image.
    min_val = np.min(img_float)
    max_val = np.max(img_float)
    value_range = max_val - min_val

    if value_range == 0:
        return image  # flat image: nothing to stretch

    # Multiply first, then divide: pre-computing 255 / range and truncating
    # the product could turn the brightest pixel into 254 (e.g. range 7 in
    # float32 gives 254.99998).
    stretched = (img_float - min_val) * 255.0 / value_range

    # Round to nearest (not truncate) and keep the result inside 0..255
    # before narrowing to uint8.
    return np.clip(np.rint(stretched), 0, 255).astype(np.uint8)

def manual_convolution(image, kernel):
    """
    Apply a 2-D kernel to every channel of an image (used by the sharpen
    step), instead of calling cv2.filter2D.

    Each output pixel is the sum of the element-wise product of the kernel
    with the zero-padded neighbourhood around that pixel. The kernel is
    applied as given (it is not flipped).

    Args:
        image: Array of shape (H, W, C) or (H, W).
        kernel: 2-D array of weights; it does not have to be square.

    Returns:
        An array with the same shape and dtype as ``image``. For integer
        dtypes the result is rounded to nearest and saturated to the
        dtype's range, so out-of-range sums become the min/max value
        instead of wrapping around.

    Raises:
        ValueError: If ``kernel`` is not 2-D or ``image`` is not 2-D/3-D.
    """
    image = np.asarray(image)
    kernel = np.asarray(kernel)
    if kernel.ndim != 2:
        raise ValueError("kernel must be a 2-D array")
    if image.ndim == 2:
        # Treat a grayscale image as a single-channel colour image.
        return manual_convolution(image[:, :, np.newaxis], kernel)[:, :, 0]
    if image.ndim != 3:
        raise ValueError("image must have shape (H, W) or (H, W, C)")

    image_h, image_w = image.shape[:2]
    kernel_h, kernel_w = kernel.shape
    # Pad each axis from its own kernel dimension so non-square kernels work.
    pad_h, pad_w = kernel_h // 2, kernel_w // 2

    # Accumulate in a type wide enough that e.g. 5 * 255 or negative sums
    # are represented exactly instead of wrapping in uint8.
    acc_dtype = np.result_type(image.dtype, kernel.dtype, np.int32)
    padded = np.pad(
        image.astype(acc_dtype),
        ((pad_h, pad_h), (pad_w, pad_w), (0, 0)),
        mode='constant',
    )
    weights = kernel.astype(acc_dtype)

    # One shifted, weighted copy of the whole image per kernel element:
    # kernel_h * kernel_w array operations instead of a Python loop over
    # every pixel and channel.
    acc = np.zeros(image.shape, dtype=acc_dtype)
    for dy in range(kernel_h):
        for dx in range(kernel_w):
            acc += weights[dy, dx] * padded[dy:dy + image_h, dx:dx + image_w, :]

    if np.issubdtype(image.dtype, np.integer):
        if not np.issubdtype(acc.dtype, np.integer):
            acc = np.rint(acc)
        limits = np.iinfo(image.dtype)
        # Saturate before narrowing so the cast below cannot wrap around.
        acc = np.clip(acc, limits.min, limits.max)

    return acc.astype(image.dtype)

def manual_sharpen(image):
    """
    [Manual CV 3] Sharpen text edges by building a kernel matrix and
    running the hand-written convolution over the image.

    Args:
        image: NumPy image array accepted by manual_convolution.

    Returns:
        A uint8 array of the same shape, with values limited to 0..255.
    """
    # Sharpen kernel: boost the centre pixel, subtract the 4 neighbours.
    kernel = np.array([[ 0, -1,  0],
                       [-1,  5, -1],
                       [ 0, -1,  0]])

    # Widen the pixel type before convolving. The kernel can produce
    # values from -4*255 to 5*255; if those were stored back into a uint8
    # array they would wrap around (1000 -> 232, -200 -> 56) and the clip
    # below would have nothing left to fix.
    image = np.asarray(image)
    work_dtype = np.promote_types(image.dtype, np.int32)
    sharpened = manual_convolution(image.astype(work_dtype), kernel)

    # Limit the result to the 0-255 range, then narrow to uint8.
    return np.clip(sharpened, 0, 255).astype(np.uint8)

def json_to_string(data, level=0):
    """Format parsed JSON-like data as indented, human-readable text.

    Args:
        data: A dict, a list, or any other value.
        level: Current nesting depth; each level indents by two spaces.

    Returns:
        A string built as follows:
        - dict entry with a dict/list value: a blank line, ``[key]:`` and
          then the value rendered one level deeper;
        - dict entry with any other value: ``- key: value``;
        - list: every item rendered at the same level; items that are not
          dicts or lists (e.g. a list of strings) become ``- value`` lines
          instead of being silently dropped;
        - anything else passed directly as ``data``: an empty string.
    """
    indent = '  ' * level
    parts = []
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                parts.append(f"\n{indent}[{key}]:\n")
                parts.append(json_to_string(value, level + 1))
            else:
                parts.append(f"{indent}- {key}: {value}\n")
    elif isinstance(data, list):
        for item in data:
            if isinstance(item, (dict, list)):
                parts.append(json_to_string(item, level))
            else:
                parts.append(f"{indent}- {item}\n")
    return "".join(parts)

# =================================================================

def extract_text(image):
    """Read an image with Donut and return the parsed fields as text.

    The image goes through the hand-written preprocessing chain
    (manual_bgr_to_rgb -> manual_contrast_stretch -> manual_sharpen), is
    handed to the Donut processor/model with the "<s_cord-v2>" task
    prompt, and the decoded sequence is converted with
    processor.token2json and formatted by json_to_string.

    Args:
        image: NumPy image array. Presumably BGR, since the first step
            swaps channels 0 and 2 -- confirm against the caller.

    Returns:
        The formatted text, "Error: Model not loaded." when the model
        failed to load, or "Error: <message>" if any step raises.
    """
    if model is None:
        return "Error: Model not loaded."

    try:
        # --- PHASE 1: manual preprocessing (hand-written computer vision) ---
        # Channel swap, then contrast stretch (helps with faded images),
        # then sharpening (crisper glyphs for the OCR model).
        prepared = manual_sharpen(manual_contrast_stretch(manual_bgr_to_rgb(image)))

        # The processor is given a PIL image.
        pil_image = Image.fromarray(prepared)

        # --- PHASE 2: AI processing (Donut Transformer) ---

        # Encode the picture into model input.
        encoded = processor(pil_image, return_tensors="pt")
        pixel_values = encoded.pixel_values.to(device)

        # Encode the task prompt that starts the decoder.
        tokenizer = processor.tokenizer
        prompt = tokenizer("<s_cord-v2>", add_special_tokens=False, return_tensors="pt")
        decoder_input_ids = prompt.input_ids.to(device)

        # Run generation without tracking gradients.
        with torch.no_grad():
            generated = model.generate(
                pixel_values,
                decoder_input_ids=decoder_input_ids,
                max_length=768,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
                num_beams=1,
                bad_words_ids=[[tokenizer.unk_token_id]],
                return_dict_in_generate=True,
            )

        # Decode, strip the EOS/PAD markers, then drop the first <...> tag
        # (the task prompt) before parsing.
        decoded = processor.batch_decode(generated.sequences)[0]
        for special_token in (tokenizer.eos_token, tokenizer.pad_token):
            decoded = decoded.replace(special_token, "")
        decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()

        return json_to_string(processor.token2json(decoded))

    except Exception as e:
        return f"Error: {str(e)}"

# Script entry point: intentionally does nothing when the file is run
# directly. Importing the module is what loads the model and defines
# extract_text().
if __name__ == "__main__":
    pass