File size: 2,237 Bytes
2b45a96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import os
import json
import cv2
import numpy as np
# Paths of the detection dataset to convert. The detection label file has one
# line per image: "<relative image path>\t<JSON list of annotations>".
input_label_file = "D:/MyCode/Python/Model/paddleocr/total_text/test/test.txt"
image_root = "D:/MyCode/Python/Model/paddleocr/total_text/test"
output_label_file = "D:/MyCode/Python/Model/paddleocr/total_text/test/test_rec.txt"
crop_output_dir = os.path.join(image_root, "rec_crop")


def parse_label_line(line):
    """Split one detection-label line into its image path and annotations.

    Args:
        line: Raw line read from the detection label file.

    Returns:
        A ``(relative_image_path, annotations)`` tuple, where ``annotations``
        is the decoded JSON value, or ``None`` when the line is blank.

    Raises:
        ValueError: If the line has no tab separator or the annotation part
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    stripped = line.strip()
    if not stripped:
        return None
    # Split on the FIRST tab only: JSON may legally contain tab characters as
    # insignificant whitespace, which must not break the two-field split.
    img_path_rel, sep, anns_json = stripped.partition('\t')
    if not sep:
        raise ValueError("missing tab separator between image path and annotations")
    return img_path_rel, json.loads(anns_json)


def clamp_box_to_image(x, y, w, h, width, height):
    """Clip an ``(x, y, w, h)`` rectangle to the image bounds.

    The far corner is computed BEFORE the origin is clamped, so a box that
    starts at a negative coordinate shrinks instead of sliding to the right or
    downwards.

    Args:
        x, y: Top-left corner of the rectangle (may be negative).
        w, h: Width and height of the rectangle.
        width, height: Size of the image the rectangle is clipped to.

    Returns:
        ``(x1, y1, x2, y2)`` with exclusive far corner, usable directly as
        ``img[y1:y2, x1:x2]``, or ``None`` when nothing of the rectangle lies
        inside the image.
    """
    x1 = max(0, x)
    y1 = max(0, y)
    x2 = min(width, x + w)
    y2 = min(height, y + h)
    if x2 <= x1 or y2 <= y1:
        return None
    return x1, y1, x2, y2


def main():
    """Crop every labelled text region and write a recognition label file.

    For each annotation whose transcription is neither empty nor the ignore
    marker ``###``, the axis-aligned bounding rectangle of its polygon is
    clipped to the image, saved as a JPEG under ``crop_output_dir``, and
    recorded in ``output_label_file`` as ``rec_crop/<file>\\t<text>``.
    """
    os.makedirs(crop_output_dir, exist_ok=True)

    out_lines = []
    # A single counter shared by all images keeps crop file names unique.
    crop_id = 0

    with open(input_label_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            try:
                parsed = parse_label_line(line)
            except ValueError as err:
                print(f"[WARNING] Dòng nhãn không hợp lệ (dòng {line_no}): {err}")
                continue
            if parsed is None:
                continue

            img_path_rel, anns = parsed
            img_path = os.path.join(image_root, img_path_rel)

            if not os.path.exists(img_path):
                print(f"[WARNING] Không tìm thấy ảnh: {img_path}")
                continue

            img = cv2.imread(img_path)
            if img is None:
                print(f"[WARNING] Lỗi đọc ảnh: {img_path}")
                continue

            height, width = img.shape[:2]
            img_stem = os.path.splitext(os.path.basename(img_path))[0]

            for ann in anns:
                text = ann['transcription'].strip()
                # "###" marks regions the dataset flags as not to be used.
                if not text or text == "###":
                    continue

                pts = np.array(ann['points'], dtype="float32")
                # cv2.boundingRect only accepts 32-bit int / float32 points;
                # a platform-dependent "int" may be int64 and be rejected.
                x, y, w, h = cv2.boundingRect(pts.astype(np.int32))

                # Keep the part of the box that lies inside the image.
                box = clamp_box_to_image(x, y, w, h, width, height)
                if box is None:
                    print(f"[WARNING] Ảnh crop rỗng ({img_path}), bỏ qua.")
                    continue
                x1, y1, x2, y2 = box
                cropped = img[y1:y2, x1:x2]

                crop_img_name = f"{img_stem}_crop_{crop_id}.jpg"
                crop_img_path = os.path.join(crop_output_dir, crop_img_name)
                # Only emit a label when the crop really reached the disk.
                if not cv2.imwrite(crop_img_path, cropped):
                    print(f"[WARNING] Không ghi được ảnh crop: {crop_img_path}")
                    continue

                out_lines.append(f"rec_crop/{crop_img_name}\t{text}")
                crop_id += 1

    with open(output_label_file, "w", encoding="utf-8") as f:
        f.write('\n'.join(out_lines))

    print(f"✅ Đã tạo {len(out_lines)} mẫu recognition tại: {output_label_file}")


if __name__ == "__main__":
    main()
|