Spaces:

sompimnara
/

cvproject

Sleeping

App Files Files Community

cvproject / ocr.py

sompimnara

Upload 7 files

a1916d2 verified about 2 months ago

raw

history blame contribute delete

8.45 kB

	import torch
	from transformers import DonutProcessor, VisionEncoderDecoderModel
	from PIL import Image
	import re
	import numpy as np
	import cv2 # ใช้แค่ตอนรับภาพเข้า (Interface) แต่ใน Process เราจะเขียนเอง

	# === 1. Load the Donut (Transformer) model ===
	# This runs at import time: it prints progress messages and defines the
	# module-level names MODEL_NAME, device, processor and model.
	print("⏳ กำลังโหลดโมเดล Donut Transformer...")
	MODEL_NAME = "naver-clova-ix/donut-base-finetuned-cord-v2"

	try:
	    # Force CPU for 100% stability on Mac
	    device = "cpu"

	    processor = DonutProcessor.from_pretrained(MODEL_NAME)
	    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
	    model.to(device)

	    print(f"✅ โหลด Donut สำเร็จ! (Running on {device})")
	except Exception as e:
	    # Loading failed: report it and leave model/processor as None so that
	    # extract_text() can return an error string instead of crashing.
	    print(f"❌ โหลดโมเดลไม่สำเร็จ: {e}")
	    model = None
	    processor = None


	# Manual Computer Vision (hand-written implementations)


	def manual_bgr_to_rgb(image):
	    """
	    [Manual CV 1] Convert BGR to RGB by manipulating the array directly,
	    instead of calling cv2.cvtColor(image, cv2.COLOR_BGR2RGB).

	    A 2-D (grayscale) input is expanded to a new uint8 array of shape
	    (h, w, 3) with the same values in every channel. Any other input is
	    copied and channels 0 and 2 are exchanged; channel 1 and any further
	    channels are left as they are. The input array is never modified.
	    """
	    if image.ndim == 2:
	        # Grayscale: broadcast the single plane into all three channels.
	        gray_rgb = np.zeros(image.shape + (3,), dtype=np.uint8)
	        gray_rgb[...] = image[:, :, np.newaxis]
	        return gray_rgb

	    # Colour: new channel 0 (R) comes from old channel 2, and new
	    # channel 2 (B) comes from old channel 0. Reading from the untouched
	    # input keeps the swap free of aliasing problems.
	    swapped = image.copy()
	    swapped[:, :, [0, 2]] = image[:, :, [2, 0]]
	    return swapped

	def manual_contrast_stretch(image):
	    """
	    [Manual CV 2] Stretch contrast with min-max normalization,
	    instead of calling cv2.normalize.

	    Formula: new_pixel = (current - min) * (255 / (max - min)), where min
	    and max are taken over the whole array (all channels together).

	    Returns a uint8 array of the same shape. If every value in the image
	    is identical, the input object itself is returned unchanged.
	    """
	    # Work in float so the subtraction and scaling cannot overflow.
	    img_float = image.astype(np.float32)

	    # Darkest and brightest values in the image.
	    min_val = np.min(img_float)
	    max_val = np.max(img_float)

	    if max_val == min_val:
	        return image  # flat image: nothing to stretch

	    stretched = (img_float - min_val) * (255.0 / (max_val - min_val))

	    # Round to the nearest integer before casting. A plain astype()
	    # truncates, which biases every value downwards and can turn a
	    # brightest pixel computed as 254.99998 into 254 instead of 255.
	    return np.rint(stretched).astype(np.uint8)

	def manual_convolution(image, kernel):
	    """
	    Apply a 2-D kernel to every channel of an image (used by sharpening),
	    instead of calling cv2.filter2D.

	    The kernel is applied as written (it is not flipped), with zero
	    padding of kernel_h // 2 rows and kernel_w // 2 columns on each side,
	    so the output has the same height and width as the input.

	    Args:
	        image: NumPy array of shape (h, w, channels) or (h, w).
	        kernel: 2-D NumPy array of weights; it may be non-square.

	    Returns:
	        Array with the same shape and dtype as ``image``. For integer
	        dtypes the result is saturated to the dtype's range instead of
	        wrapping around (e.g. 1275 becomes 255 for uint8, not 251).
	    """
	    # Handle a single-channel 2-D image as (h, w, 1) and undo it on return.
	    is_2d = image.ndim == 2
	    if is_2d:
	        image = image[:, :, np.newaxis]

	    image_h, image_w, _ = image.shape
	    kernel_h, kernel_w = kernel.shape
	    # Pad each axis by its own half-size so non-square kernels work.
	    pad_h = kernel_h // 2
	    pad_w = kernel_w // 2

	    # Accumulate in a wide dtype so neither the products nor the running
	    # sum can overflow the (typically uint8) image dtype.
	    acc_dtype = np.result_type(image.dtype, kernel.dtype, np.int64)
	    padded_image = np.pad(
	        image.astype(acc_dtype),
	        ((pad_h, pad_h), (pad_w, pad_w), (0, 0)),
	        mode='constant',
	    )

	    # Loop over the few kernel taps rather than over every pixel: each tap
	    # contributes one shifted, weighted copy of the whole padded image.
	    accumulator = np.zeros(image.shape, dtype=acc_dtype)
	    for ky in range(kernel_h):
	        for kx in range(kernel_w):
	            window = padded_image[ky:ky + image_h, kx:kx + image_w, :]
	            accumulator += window * kernel[ky, kx]

	    # Saturate before casting back so out-of-range sums do not wrap.
	    out_dtype = image.dtype
	    if out_dtype.kind in "iu":
	        limits = np.iinfo(out_dtype)
	        accumulator = np.clip(accumulator, limits.min, limits.max)
	    output = accumulator.astype(out_dtype)

	    return output[:, :, 0] if is_2d else output

	def manual_sharpen(image):
	    """
	    [Manual CV 3] Sharpen text by building a kernel matrix and running the
	    hand-written convolution over the image.

	    Args:
	        image: NumPy image array accepted by manual_convolution.

	    Returns:
	        uint8 array of the same shape, with values clipped to 0-255.
	    """
	    # Sharpen kernel: emphasise the centre pixel, subtract its 4 neighbours.
	    kernel = np.array([[ 0, -1,  0],
	                       [-1,  5, -1],
	                       [ 0, -1,  0]])

	    # Convolve in a wider dtype. The kernel can produce values from
	    # -4*255 to 5*255; computing them in uint8 would wrap around before
	    # the clip below ever sees them, making the clip a no-op.
	    wide_image = image.astype(np.result_type(image.dtype, np.int32))
	    sharpened = manual_convolution(wide_image, kernel)

	    # Clip the overshoot into 0-255 and convert back to uint8.
	    return np.clip(sharpened, 0, 255).astype(np.uint8)

	def json_to_string(data, level=0):
	    """
	    Format nested JSON-like data (dicts and lists) as readable text.

	    Args:
	        data: dict, list, or anything else. A top-level value that is
	            neither a dict nor a list yields an empty string.
	        level: current nesting depth; each level indents by one space.

	    Returns:
	        A string in which scalar dict entries appear as "- key: value"
	        lines, nested containers get a "[key]:" header followed by their
	        content one level deeper, and scalar list items appear as
	        "- item" lines (previously such items were silently dropped).
	    """
	    indent = ' ' * level
	    parts = []
	    if isinstance(data, dict):
	        for key, value in data.items():
	            if isinstance(value, (dict, list)):
	                parts.append(f"\n{indent}[{key}]:\n")
	                parts.append(json_to_string(value, level + 1))
	            else:
	                parts.append(f"{indent}- {key}: {value}\n")
	    elif isinstance(data, list):
	        for item in data:
	            if isinstance(item, (dict, list)):
	                parts.append(json_to_string(item, level))
	            else:
	                # Keep plain values (e.g. a list of strings) in the output.
	                parts.append(f"{indent}- {item}\n")
	    return "".join(parts)

	# =================================================================

	def extract_text(image):
	    """
	    Run the full OCR pipeline on one image and return formatted text.

	    The image goes through the hand-written preprocessing steps (channel
	    swap, contrast stretch, sharpen), is handed to the Donut processor and
	    model, and the decoded output is converted to JSON and then to text.

	    Args:
	        image: NumPy image array; presumably BGR as delivered by OpenCV,
	            since it is passed through manual_bgr_to_rgb first (confirm
	            against the caller).

	    Returns:
	        The formatted text, or a string starting with "Error: " if the
	        model is not loaded or any step raises an exception.
	    """
	    if model is None:
	        return "Error: Model not loaded."

	    try:
	        # --- PHASE 1: manual preprocessing (hand-written computer vision) ---
	        # BGR -> RGB, then contrast stretch (helps with faded receipts),
	        # then sharpening (makes the text crisper for OCR).
	        preprocessed = manual_sharpen(
	            manual_contrast_stretch(manual_bgr_to_rgb(image))
	        )

	        # The processor takes a PIL image.
	        pil_image = Image.fromarray(preprocessed)

	        # --- PHASE 2: AI processing (Donut Transformer) ---
	        tokenizer = processor.tokenizer

	        # Model input tensor.
	        pixel_values = processor(pil_image, return_tensors="pt").pixel_values
	        pixel_values = pixel_values.to(device)

	        # Task prompt that starts the decoder.
	        decoder_input_ids = tokenizer(
	            "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
	        ).input_ids.to(device)

	        generation_options = {
	            "decoder_input_ids": decoder_input_ids,
	            "max_length": 768,
	            "early_stopping": True,
	            "pad_token_id": tokenizer.pad_token_id,
	            "eos_token_id": tokenizer.eos_token_id,
	            "use_cache": True,
	            "num_beams": 1,
	            "bad_words_ids": [[tokenizer.unk_token_id]],
	            "return_dict_in_generate": True,
	        }

	        # Inference only, so gradients are not needed.
	        with torch.no_grad():
	            outputs = model.generate(pixel_values, **generation_options)

	        # Decode, strip the EOS/PAD tokens, then drop the first <...> tag.
	        decoded = processor.batch_decode(outputs.sequences)[0]
	        for special_token in (tokenizer.eos_token, tokenizer.pad_token):
	            decoded = decoded.replace(special_token, "")
	        decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()

	        return json_to_string(processor.token2json(decoded))

	    except Exception as exc:
	        return f"Error: {str(exc)}"

	# Script entry point: running this file directly currently does nothing
	# beyond the module-level model loading above.
	if __name__ == "__main__":
	    pass