Upload 3 files

Browse files

Files changed (3) hide show

PP-OCRv5_mobile_rec.yml +140 -0
convert_format.py +17 -0
convert_format_test.py +67 -0

PP-OCRv5_mobile_rec.yml ADDED Viewed

	@@ -0,0 +1,140 @@

+# Run-wide settings: device, schedule length, logging/eval cadence, and the
+# checkpoint, dictionary and output paths used by the rest of this config.
+Global:
+  model_name: PP-OCRv5_mobile_rec # To use static model for inference.
+  debug: false
+  use_gpu: true
+  epoch_num: 75
+  log_smooth_window: 20
+  print_batch_step: 10
+  save_model_dir: ./output/PP-OCRv5_mobile_rec
+  save_epoch_step: 10
+  eval_batch_step: [0, 2000]
+  cal_metric_during_train: true
+  # NOTE(review): machine-specific absolute Windows path. The directory name
+  # (rec_mv3_none_bilstm_ctc_v2.0_train) suggests weights for a different
+  # network than the PPLCNetV3 + MultiHead Architecture configured below --
+  # confirm this checkpoint actually matches the model.
+  pretrained_model: D:/MyCode/Python/Model/paddleocr/rec_mv3_none_bilstm_ctc_v2.0_train/best_accuracy.pdparams
+  # Left empty (parses as null): no checkpoint to resume and no inference export dir set.
+  checkpoints:
+  save_inference_dir:
+  use_visualdl: false
+  infer_img: doc/imgs_words/ch/word_1.jpg
+  character_dict_path: ./ppocr/utils/dict/latin_dict.txt
+  # Anchor reused by Architecture.Head (NRTRHead) and Train RecConAug so the
+  # text-length limit is defined in one place.
+  max_text_length: &max_text_length 25
+  infer_mode: false
+  use_space_char: true
+  distributed: true
+  save_res_path: ./output/rec/predicts_ppocrv5.txt
+  d2s_train_image_shape: [3, 48, 320]
+# Adam optimizer with a cosine learning-rate schedule (5 warmup epochs) and
+# L2 regularization.
+Optimizer:
+  name: Adam
+  beta1: 0.9
+  beta2: 0.999
+  lr:
+    name: Cosine
+    learning_rate: 0.0005
+    warmup_epoch: 5
+  regularizer:
+    name: L2
+    factor: 3.0e-05
+# Recognition model: PPLCNetV3 backbone feeding a MultiHead made of a CTC head
+# (with an svtr neck) and an NRTR head.
+Architecture:
+  model_type: rec
+  algorithm: SVTR_LCNet
+  # Empty value (parses as null): no Transform module configured.
+  Transform:
+  Backbone:
+    name: PPLCNetV3
+    scale: 0.95
+  Head:
+    name: MultiHead
+    head_list:
+      - CTCHead:
+          Neck:
+            name: svtr
+            dims: 120
+            depth: 2
+            hidden_dims: 120
+            kernel_size: [1, 3]
+            use_guide: True
+          Head:
+            fc_decay: 0.00001
+      - NRTRHead:
+          nrtr_dim: 384
+          # Alias of Global.max_text_length.
+          max_text_length: *max_text_length
+# MultiLoss with a CTC and an NRTR entry, mirroring the two heads listed in
+# Architecture.Head.head_list; both entries are left without options (null).
+Loss:
+  name: MultiLoss
+  loss_config_list:
+    - CTCLoss:
+    - NRTRLoss:
+# Post-processing step applied to model output: CTCLabelDecode.
+PostProcess:
+  name: CTCLabelDecode
+# Evaluation metric: RecMetric, with "acc" as the main indicator.
+Metric:
+  name: RecMetric
+  main_indicator: acc
+# Training data pipeline: MultiScaleDataSet batched by MultiScaleSampler.
+Train:
+  dataset:
+    name: MultiScaleDataSet
+    ds_width: false
+    # NOTE(review): machine-specific absolute Windows paths. train_rec.txt is
+    # the file written by convert_format.py.
+    data_dir: D:/MyCode/Python/Model/paddleocr/total_text/train
+    ext_op_transform_idx: 1
+    label_file_list:
+    - D:/MyCode/Python/Model/paddleocr/total_text/train/train_rec.txt
+    transforms:
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false
+    - RecConAug:
+        prob: 0.5
+        ext_data_num: 2
+        image_shape: [48, 320, 3]
+        # Alias of Global.max_text_length.
+        max_text_length: *max_text_length
+    - RecAug:
+    - MultiLabelEncode:
+        gtc_encode: NRTRLabelEncode
+    # Keys retained in each sample after the transforms above.
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label_ctc
+        - label_gtc
+        - length
+        - valid_ratio
+  sampler:
+    name: MultiScaleSampler
+    scales: [[320, 32], [320, 48], [320, 64]]
+    # Anchor reused as loader.batch_size_per_card below.
+    first_bs: &bs 128
+    fix_bs: false
+    divided_factor: [8, 16] # w, h
+    is_training: True
+  loader:
+    shuffle: true
+    batch_size_per_card: *bs
+    drop_last: true
+    num_workers: 8
+# Evaluation data pipeline: SimpleDataSet, RecResizeImg with image_shape
+# [3, 48, 320], no shuffling, no dropped last batch.
+Eval:
+  dataset:
+    name: SimpleDataSet
+    # NOTE(review): machine-specific absolute Windows paths. test_rec.txt is
+    # the file written by convert_format_test.py, whose rows are relative
+    # "rec_crop/<name>.jpg" paths under this data_dir.
+    data_dir: D:/MyCode/Python/Model/paddleocr/total_text/test
+    label_file_list:
+    - D:/MyCode/Python/Model/paddleocr/total_text/test/test_rec.txt
+    transforms:
+    - DecodeImage:
+        img_mode: BGR
+        channel_first: false
+    - MultiLabelEncode:
+        gtc_encode: NRTRLabelEncode
+    - RecResizeImg:
+        image_shape: [3, 48, 320]
+    # Keys retained in each sample after the transforms above.
+    - KeepKeys:
+        keep_keys:
+        - image
+        - label_ctc
+        - label_gtc
+        - length
+        - valid_ratio
+  loader:
+    shuffle: false
+    drop_last: false
+    batch_size_per_card: 128
+    num_workers: 4

convert_format.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import json
+with open(r'D:\MyCode\Python\Model\paddleocr\total_text\test\train.txt', 'r', encoding='utf-8') as f, open(r'D:\MyCode\Python\Model\paddleocr\total_text\train\train_rec.txt', 'w', encoding='utf-8') as out_f:
+    for line in f:
+        parts = line.strip().split('\t')
+        if len(parts) != 2:
+            continue  # bỏ qua dòng lỗi
+        img_path, annotations = parts
+        try:
+            ann_list = json.loads(annotations)
+            for ann in ann_list:
+                text = ann.get("transcription", "").strip()
+                if text:
+                    out_f.write(f"{img_path}\t{text}\n")
+        except json.JSONDecodeError:
+            print(f"Lỗi JSON ở dòng: {line}")

convert_format_test.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import json
+import cv2
+import numpy as np
+input_label_file = "D:/MyCode/Python/Model/paddleocr/total_text/test/test.txt"
+image_root = "D:/MyCode/Python/Model/paddleocr/total_text/test"
+output_label_file = "D:/MyCode/Python/Model/paddleocr/total_text/test/test_rec.txt"
+crop_output_dir = os.path.join(image_root, "rec_crop")
+os.makedirs(crop_output_dir, exist_ok=True)
+with open(input_label_file, "r", encoding="utf-8") as f:
+    lines = f.readlines()
+out_lines = []
+crop_id = 0
+for line in lines:
+    img_path_rel, anns = line.strip().split('\t')
+    img_path = os.path.join(image_root, img_path_rel)
+    anns = json.loads(anns)
+    if not os.path.exists(img_path):
+        print(f"[WARNING] Không tìm thấy ảnh: {img_path}")
+        continue
+    img = cv2.imread(img_path)
+    if img is None:
+        print(f"[WARNING] Lỗi đọc ảnh: {img_path}")
+        continue
+    height, width = img.shape[:2]
+    for ann in anns:
+        text = ann['transcription']
+        points = ann['points']
+        if text.strip().lower() == "###" or not text.strip():
+            continue
+        pts = np.array(points, dtype="float32")
+        x, y, w, h = cv2.boundingRect(pts.astype("int"))
+        # Giới hạn x, y, w, h nằm trong ảnh
+        x = max(0, x)
+        y = max(0, y)
+        if x + w > width or y + h > height:
+            print(f"[WARNING] Box vượt quá kích thước ảnh ({img_path}): x={x}, y={y}, w={w}, h={h}")
+            continue
+        cropped = img[y:y+h, x:x+w]
+        if cropped is None or cropped.size == 0:
+            print(f"[WARNING] Ảnh crop rỗng ({img_path}), bỏ qua.")
+            continue
+        crop_img_name = f"{os.path.splitext(os.path.basename(img_path))[0]}_crop_{crop_id}.jpg"
+        crop_img_path = os.path.join(crop_output_dir, crop_img_name)
+        cv2.imwrite(crop_img_path, cropped)
+        out_line = f"rec_crop/{crop_img_name}\t{text.strip()}"
+        out_lines.append(out_line)
+        crop_id += 1
+with open(output_label_file, "w", encoding="utf-8") as f:
+    f.write('\n'.join(out_lines))
+print(f"✅ Đã tạo {len(out_lines)} mẫu recognition tại: {output_label_file}")