Mayo committed on
Commit
cec8437
·
unverified ·
1 Parent(s): 233b9bb

chore: add refine manga109 script

Browse files
Files changed (2) hide show
  1. scripts/manga109_yolo.py +0 -160
  2. scripts/refine_manga109.py +828 -0
scripts/manga109_yolo.py DELETED
@@ -1,160 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import argparse
4
- import manga109api
5
- import shutil
6
- import random
7
- import math
8
-
9
-
10
def convert_to_yolo_format(x_min, y_min, x_max, y_max, img_width, img_height):
    """Turn corner-style pixel coordinates into a normalised YOLO box.

    Returns ``(x_center, y_center, width, height)``; the horizontal values
    are divided by ``img_width`` and the vertical ones by ``img_height``.
    """
    mid_x = (x_min + x_max) / 2
    mid_y = (y_min + y_max) / 2
    span_x = x_max - x_min
    span_y = y_max - y_min
    return (
        mid_x / img_width,
        mid_y / img_height,
        span_x / img_width,
        span_y / img_height,
    )
18
-
19
-
20
def process_annotation(ann, class_id, img_width, img_height, out_file):
    """Append one YOLO label row describing *ann* to *out_file*.

    The ``@xmin``/``@ymin``/``@xmax``/``@ymax`` entries of *ann* are parsed
    as integers, normalised through ``convert_to_yolo_format`` and written
    as ``<class_id> <xc> <yc> <w> <h>`` with six decimal places.
    """
    corners = [int(ann[key]) for key in ("@xmin", "@ymin", "@xmax", "@ymax")]
    yolo_box = convert_to_yolo_format(*corners, img_width, img_height)
    fields = " ".join(f"{value:.6f}" for value in yolo_box)
    out_file.write(f"{class_id} {fields}\n")
34
-
35
-
36
def manga109_to_yolo(manga109_root_dir, output_dir):
    """Convert Manga109 annotations to YOLO format with 80/20 train/val split.

    Creates ``images/{train,val}`` and ``labels/{train,val}`` under
    *output_dir*, writes ``classes.txt`` and ``data.yaml``, and delegates the
    per-book work to ``process_books``. Books (not pages) are split, using
    the module-level ``random`` state for the shuffle.
    """
    parser = manga109api.Parser(root_dir=manga109_root_dir)

    # Single source of truth for class names; ids follow list order.
    class_names = ["frame", "text"]
    class_map = {name: idx for idx, name in enumerate(class_names)}

    for kind in ("images", "labels"):
        for split in ("train", "val"):
            os.makedirs(os.path.join(output_dir, kind, split), exist_ok=True)

    with open(os.path.join(output_dir, "classes.txt"), "w") as f:
        for class_name in class_names:
            f.write(f"{class_name}\n")

    # Shuffle a copy: shuffling ``parser.books`` directly would reorder the
    # parser's own list as a side effect.
    book_list = list(parser.books)
    random.shuffle(book_list)

    # 80% of the books (rounded up) go to training, the rest to validation.
    split_idx = math.ceil(len(book_list) * 0.8)
    train_books = book_list[:split_idx]
    val_books = book_list[split_idx:]

    print(f"Training books: {len(train_books)}")
    print(f"Validation books: {len(val_books)}")

    process_books(parser, train_books, output_dir, class_map, "train")
    process_books(parser, val_books, output_dir, class_map, "val")

    # YAML dataset description consumed by YOLO training tools.
    yaml_path = os.path.join(output_dir, "data.yaml")
    with open(yaml_path, "w") as f:
        f.write(f"path: {os.path.abspath(output_dir)}\n")
        f.write("train: images/train\n")
        f.write("val: images/val\n\n")

        f.write("names:\n")
        for i, name in enumerate(class_names):
            f.write(f" {i}: {name}\n")
84
-
85
-
86
def process_books(parser, book_list, output_dir, class_map, split_type):
    """Export every page of the given books into one split of the dataset.

    For each page the source image is copied (when it exists on disk) to
    ``images/<split_type>`` and a label file listing the page's ``frame``
    and ``text`` annotations is written to ``labels/<split_type>``.
    """
    images_dir = os.path.join(output_dir, "images", split_type)
    labels_dir = os.path.join(output_dir, "labels", split_type)

    for book in book_list:
        print(f"Processing {book} for {split_type}...")
        annotation = parser.get_annotation(book=book)

        for page in annotation["page"]:
            page_idx = page["@index"]
            img_width = int(page["@width"])
            img_height = int(page["@height"])

            # Book title plus zero-padded page index keeps names unique.
            filename = f"{book}_{page_idx:03d}"

            img_src_path = parser.img_path(book=book, index=page_idx)
            if os.path.exists(img_src_path):
                shutil.copy2(img_src_path, os.path.join(images_dir, f"{filename}.jpg"))

            with open(os.path.join(labels_dir, f"{filename}.txt"), "w") as f:
                for ann_type in ("frame", "text"):
                    if ann_type not in page:
                        continue
                    # A lone annotation arrives as a single object, not a list.
                    annotations = page[ann_type]
                    if not isinstance(annotations, list):
                        annotations = [annotations]
                    for ann in annotations:
                        process_annotation(
                            ann, class_map[ann_type], img_width, img_height, f
                        )
130
-
131
-
132
def main():
    """Parse the command line, seed ``random`` and run the YOLO conversion."""
    cli = argparse.ArgumentParser(
        description="Convert Manga109 dataset to YOLO format with 80/20 train/val split"
    )
    cli.add_argument(
        "--manga109_dir", required=True, help="Path to Manga109 dataset root directory"
    )
    cli.add_argument(
        "--output_dir",
        required=True,
        help="Output directory for YOLO-formatted dataset",
    )
    cli.add_argument(
        "--seed", type=int, default=42, help="Random seed for dataset splitting"
    )
    options = cli.parse_args()

    # Seeding the global generator makes the book shuffle reproducible.
    random.seed(options.seed)

    manga109_to_yolo(options.manga109_dir, options.output_dir)

    print(f"Conversion complete! Output saved to {options.output_dir}")
    print(f"Dataset split: 80% training, 20% validation")
157
-
158
-
159
- if __name__ == "__main__":
160
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/refine_manga109.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Refine Manga109 annotations into PaddleOCR-ready detection/recognition data."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import math
9
+ import os
10
+ import random
11
+ import shutil
12
+ import sys
13
+ import types
14
+ import xml.etree.ElementTree as ET
15
+ from collections import Counter
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Iterable, Sequence
19
+
20
+ import cv2
21
+ import numpy as np
22
+
23
+ try:
24
+ from tqdm import tqdm
25
+ except Exception: # pragma: no cover
26
+ tqdm = None
27
+
28
+
29
# File extension appended to the zero-padded page index when locating a
# page image inside a book's image directory.
IMAGE_EXT = ".jpg"
# Default value of --split-spec: comma-separated train,val,test weights.
DEFAULT_SPLIT_SPEC = "87,11,11"
31
+
32
+
33
@dataclass(frozen=True)
class Box:
    """Immutable axis-aligned rectangle stored as integer corner coordinates.

    ``(x1, y1)`` is the minimum corner and ``(x2, y2)`` the maximum corner.
    Width and height are clamped to zero, so an inverted box has no area.
    """

    x1: int
    y1: int
    x2: int
    y2: int

    @property
    def width(self) -> int:
        """Horizontal extent, never negative."""
        span = self.x2 - self.x1
        return span if span > 0 else 0

    @property
    def height(self) -> int:
        """Vertical extent, never negative."""
        span = self.y2 - self.y1
        return span if span > 0 else 0

    @property
    def area(self) -> int:
        """Product of the clamped width and height."""
        return self.width * self.height

    @property
    def center_x(self) -> float:
        """X coordinate halfway across the clamped width."""
        return self.width / 2.0 + self.x1

    @property
    def center_y(self) -> float:
        """Y coordinate halfway down the clamped height."""
        return self.height / 2.0 + self.y1

    def expand(self, image_shape: tuple[int, int, int], ratio: float = 0.06, min_pad: int = 4) -> "Box":
        """Return a padded copy clipped to the image bounds.

        Each side grows by ``ratio`` times the matching dimension, but by at
        least ``min_pad`` pixels. ``image_shape`` is read as ``(height, width, ...)``.
        """
        img_h, img_w = image_shape[:2]
        grow_x = max(min_pad, int(round(self.width * ratio)))
        grow_y = max(min_pad, int(round(self.height * ratio)))
        left = max(0, self.x1 - grow_x)
        top = max(0, self.y1 - grow_y)
        right = min(img_w, self.x2 + grow_x)
        bottom = min(img_h, self.y2 + grow_y)
        return Box(left, top, right, bottom)

    def intersection_area(self, other: "Box") -> int:
        """Area shared with *other*; zero when the boxes only touch or are apart."""
        overlap_w = min(self.x2, other.x2) - max(self.x1, other.x1)
        overlap_h = min(self.y2, other.y2) - max(self.y1, other.y1)
        if overlap_w <= 0 or overlap_h <= 0:
            return 0
        return overlap_w * overlap_h

    def iou(self, other: "Box") -> float:
        """Intersection-over-union with *other* (``0.0`` when disjoint)."""
        shared = self.intersection_area(other)
        if shared <= 0:
            return 0.0
        combined = self.area + other.area - shared
        return shared / max(combined, 1)

    def overlap_ratio(self, other: "Box") -> float:
        """Fraction of this box's own area that is covered by *other*."""
        return self.intersection_area(other) / max(self.area, 1)

    def contains_center(self, other: "Box") -> bool:
        """Whether the centre of *other* lies inside this box (edges inclusive)."""
        cx, cy = other.center_x, other.center_y
        inside_x = self.x1 <= cx <= self.x2
        inside_y = self.y1 <= cy <= self.y2
        return inside_x and inside_y

    def to_quad(self) -> list[list[int]]:
        """Four ``[x, y]`` corners: min corner, then (x2, y1), (x2, y2), (x1, y2)."""
        corners = (
            (self.x1, self.y1),
            (self.x2, self.y1),
            (self.x2, self.y2),
            (self.x1, self.y2),
        )
        return [[x, y] for x, y in corners]

    def to_list(self) -> list[int]:
        """Flat ``[x1, y1, x2, y2]`` representation."""
        corners = (self.x1, self.y1, self.x2, self.y2)
        return list(corners)
104
+
105
+
106
@dataclass
class OriginalText:
    """One non-empty ``<text>`` annotation read from a Manga109 page element."""

    # Value of the element's ``id`` attribute.
    text_id: str
    # Rectangle built from the xmin/ymin/xmax/ymax attributes.
    bbox: Box
    # Element text with surrounding whitespace stripped.
    transcript: str
    # "vertical" when the box is at least as tall as it is wide, else "horizontal".
    orientation: str
112
+
113
+
114
@dataclass
class CTDBlock:
    """A comic-text-detector block that was accepted for one original text box."""

    # Bounding rectangle reported by the detector.
    bbox: Box
    # Four-point quad derived from the line polygons (or from bbox when there are none).
    quad: list[list[int]]
    # Per-line polygons, each a list of [x, y] integer points.
    line_polygons: list[list[list[int]]]
    # Detector's vertical-text flag for the block.
    vertical: bool
    # Heuristic ranking score assigned during block selection.
    score: float
    # Best agreement (coverage or IoU) with any OpenCV candidate box.
    support: float
122
+
123
+
124
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Options are declared in a table and registered in order, so ``--help``
    lists them in the sequence shown below.
    """
    cli = argparse.ArgumentParser(
        description="Refine Manga109 annotations using OpenCV candidates and comic-text-detector."
    )
    option_table = (
        # Input and output locations.
        ("--dataset-root", {"default": "data/Manga109_released_2021_12_30"}),
        ("--output-root", {"default": "data/manga109_refined_paddleocr"}),
        ("--ctd-root", {"default": "temp/comic-text-detector"}),
        ("--model-path", {"default": "temp/comic-text-detector/data/comictextdetector.pt"}),
        # Run configuration.
        ("--device", {"default": "cuda", "choices": ["cuda", "cpu"]}),
        ("--seed", {"type": int, "default": 42}),
        ("--split-spec", {"default": DEFAULT_SPLIT_SPEC}),
        ("--overwrite", {"action": "store_true"}),
        ("--book-limit", {"type": int, "default": 0}),
        ("--page-limit", {"type": int, "default": 0}),
        # comic-text-detector settings.
        ("--ctd-input-size", {"type": int, "default": 1024}),
        ("--ctd-conf-thresh", {"type": float, "default": 0.4}),
        ("--ctd-nms-thresh", {"type": float, "default": 0.35}),
        # OpenCV candidate filtering.
        ("--cv2-min-area-ratio", {"type": float, "default": 0.015}),
        ("--cv2-max-area-ratio", {"type": float, "default": 0.95}),
        ("--cv2-max-candidates", {"type": int, "default": 8}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
145
+
146
+
147
def iter_with_progress(iterable: Sequence, desc: str) -> Iterable:
    """Wrap *iterable* in a tqdm bar labelled *desc*, or return it unchanged.

    The plain iterable is returned when the optional ``tqdm`` import failed
    at module load (the module-level ``tqdm`` name is ``None``).
    """
    return iterable if tqdm is None else tqdm(iterable, desc=desc)
151
+
152
+
153
def install_ctd_compat_shims() -> None:
    """Patch the runtime so the comic-text-detector code can be imported.

    * Adds the NumPy aliases ``bool8``, ``float_``, ``int_`` and ``uint``
      when the installed NumPy does not define them.
    * Registers do-nothing stand-ins for ``wandb`` and ``torchsummary`` in
      ``sys.modules`` unless those modules are already loaded.
    """
    legacy_numpy_names = (
        ("bool8", np.bool_),
        ("float_", np.float64),
        ("int_", np.int64),
        ("uint", np.uint64),
    )
    for alias, target in legacy_numpy_names:
        if hasattr(np, alias):
            continue
        setattr(np, alias, target)

    def _noop(*args, **kwargs):
        return None

    if "wandb" not in sys.modules:
        sys.modules["wandb"] = types.SimpleNamespace(init=_noop)

    if "torchsummary" not in sys.modules:
        stub = types.ModuleType("torchsummary")
        stub.summary = _noop
        sys.modules["torchsummary"] = stub
171
+
172
+
173
def load_ctd_detector(
    ctd_root: Path,
    model_path: Path,
    device: str,
    input_size: int,
    conf_thresh: float,
    nms_thresh: float,
):
    """Import comic-text-detector from *ctd_root* and build its ``TextDetector``.

    The compatibility shims are installed first and *ctd_root* is pushed to
    the front of ``sys.path`` so its ``inference`` module can be imported.

    Raises:
        RuntimeError: if *device* is ``"cuda"`` but torch reports no CUDA.
    """
    install_ctd_compat_shims()
    ctd_dir = str(ctd_root.resolve())
    sys.path.insert(0, ctd_dir)
    from inference import TextDetector  # type: ignore
    import torch

    cuda_missing = device == "cuda" and not torch.cuda.is_available()
    if cuda_missing:
        raise RuntimeError("CUDA was requested but torch.cuda.is_available() is false.")

    detector_kwargs = {
        "model_path": str(model_path.resolve()),
        "input_size": input_size,
        "device": device,
        "conf_thresh": conf_thresh,
        "nms_thresh": nms_thresh,
        "act": "leaky",
    }
    return TextDetector(**detector_kwargs)
197
+
198
+
199
def load_books(dataset_root: Path) -> list[str]:
    """Read book titles from ``books.txt`` under *dataset_root*.

    Each line is stripped of surrounding whitespace; blank lines are dropped.
    """
    listing = (dataset_root / "books.txt").read_text(encoding="utf-8")
    titles: list[str] = []
    for raw_line in listing.splitlines():
        title = raw_line.strip()
        if title:
            titles.append(title)
    return titles
202
+
203
+
204
def compute_split_counts(total_books: int, spec: str) -> tuple[int, int, int]:
    """Turn a ``"train,val,test"`` weight spec into per-split book counts.

    Counts are allocated with the largest-remainder method so they always
    sum to *total_books*. When there are at least three books, a split with
    a positive weight that ended up empty borrows one book from the
    currently largest split. A split whose weight is 0 stays empty.

    Args:
        total_books: Number of books to distribute.
        spec: Three comma-separated non-negative integer weights.

    Returns:
        ``(train_count, val_count, test_count)``.

    Raises:
        ValueError: if *spec* is not three non-negative integers, or if all
            weights are zero while there are books to distribute.
    """
    try:
        weights = [int(part.strip()) for part in spec.split(",")]
    except ValueError as exc:
        # Report non-integer parts with the same message as other bad specs.
        raise ValueError(f"Invalid split spec: {spec}") from exc
    if len(weights) != 3 or any(weight < 0 for weight in weights):
        raise ValueError(f"Invalid split spec: {spec}")
    if total_books <= 0:
        return 0, 0, 0

    weight_sum = sum(weights)
    if weight_sum == 0:
        # Without this guard the proportional share divides by zero.
        raise ValueError(f"Invalid split spec: {spec}")

    raw = [total_books * weight / weight_sum for weight in weights]
    counts = [math.floor(value) for value in raw]
    remainder = total_books - sum(counts)
    # Leftover books go to the largest fractional parts; ties favour the
    # heavier weight, then the earlier split.
    order = sorted(
        range(3),
        key=lambda idx: (raw[idx] - counts[idx], weights[idx]),
        reverse=True,
    )
    for idx in order[:remainder]:
        counts[idx] += 1

    if total_books >= 3:
        for idx in range(3):
            # Only backfill splits the caller asked for; weight 0 means
            # "leave this split empty".
            if counts[idx] == 0 and weights[idx] > 0:
                donor = max(range(3), key=lambda j: counts[j])
                if counts[donor] > 1:
                    counts[donor] -= 1
                    counts[idx] += 1

    return counts[0], counts[1], counts[2]
232
+
233
+
234
def split_books(books: list[str], seed: int, spec: str) -> dict[str, list[str]]:
    """Shuffle *books* deterministically and cut them into train/val/test.

    The input list is copied before shuffling, and the shuffle uses a
    private ``random.Random(seed)`` so the global generator is untouched.
    Split sizes come from ``compute_split_counts``.
    """
    order = list(books)
    random.Random(seed).shuffle(order)
    n_train, n_val, n_test = compute_split_counts(len(order), spec)
    val_end = n_train + n_val
    test_end = val_end + n_test
    return {
        "train": order[:n_train],
        "val": order[n_train:val_end],
        "test": order[val_end:test_end],
    }
243
+
244
+
245
def ensure_clean_dir(path: Path, overwrite: bool) -> None:
    """Make sure *path* exists as a directory.

    With *overwrite* set, an existing tree at *path* is deleted first;
    otherwise existing contents are left in place.
    """
    already_there = path.exists()
    if already_there and overwrite:
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)
249
+
250
+
251
def hardlink_or_copy(src: Path, dst: Path) -> None:
    """Place *src* at *dst* via a hard link, copying if linking fails.

    An existing *dst* is left untouched. Missing parent directories of
    *dst* are created.
    """
    if not dst.exists():
        dst.parent.mkdir(parents=True, exist_ok=True)
        try:
            os.link(src, dst)
        except Exception:
            # Best-effort fallback when a hard link cannot be created.
            shutil.copy2(src, dst)
259
+
260
+
261
def parse_original_texts(page: ET.Element) -> list[OriginalText]:
    """Collect the non-empty ``<text>`` children of a page element.

    Elements whose text is missing or whitespace-only are skipped. The
    orientation is "vertical" when the box is at least as tall as it is
    wide, otherwise "horizontal".
    """
    collected: list[OriginalText] = []
    for node in page.findall("./text"):
        content = (node.text or "").strip()
        if content:
            attrs = node.attrib
            region = Box(
                int(attrs["xmin"]),
                int(attrs["ymin"]),
                int(attrs["xmax"]),
                int(attrs["ymax"]),
            )
            if region.height >= region.width:
                direction = "vertical"
            else:
                direction = "horizontal"
            record = OriginalText(
                text_id=attrs["id"],
                bbox=region,
                transcript=content,
                orientation=direction,
            )
            collected.append(record)
    return collected
283
+
284
+
285
def order_points_clockwise(points: np.ndarray) -> list[list[int]]:
    """Sort 2-D points by angle around their centroid and round to integers.

    Points are ordered by increasing ``arctan2(dy, dx)`` relative to the
    mean point. The sequence is then rotated so it starts at the point with
    the smallest ``x + y``.
    """
    pts = np.asarray(points, dtype=np.float32)
    centroid = pts.mean(axis=0)
    offsets_y = pts[:, 1] - centroid[1]
    offsets_x = pts[:, 0] - centroid[0]
    by_angle = pts[np.argsort(np.arctan2(offsets_y, offsets_x))]
    first = int(np.argmin(by_angle.sum(axis=1)))
    # Rotate rows so that index ``first`` becomes index 0.
    rotated = np.concatenate([by_angle[first:], by_angle[:first]], axis=0)
    return [[int(round(row[0])), int(round(row[1]))] for row in rotated]
293
+
294
+
295
def quad_from_line_polygons(line_polygons: Sequence[Sequence[Sequence[int]]], fallback_box: Box) -> list[list[int]]:
    """Fit one minimum-area quad around all points of *line_polygons*.

    Args:
        line_polygons: Polygons, each a sequence of ``[x, y]`` points. The
            polygons may have different numbers of points.
        fallback_box: Box whose corners are returned when no points exist.

    Returns:
        Four ``[x, y]`` integer corners ordered by ``order_points_clockwise``.
    """
    # Flatten polygon by polygon: stacking everything with a single
    # ``np.array`` call fails when polygons differ in point count.
    point_sets = [
        np.asarray(polygon, dtype=np.float32).reshape(-1, 2)
        for polygon in line_polygons
        if len(polygon) > 0
    ]
    if not point_sets:
        # No polygons, or only empty ones: nothing to fit a rectangle to.
        return fallback_box.to_quad()
    points = np.concatenate(point_sets, axis=0)
    rect = cv2.minAreaRect(points)
    quad = cv2.boxPoints(rect)
    return order_points_clockwise(quad)
302
+
303
+
304
def merge_overlapping_boxes(boxes: Sequence[Box], iou_thresh: float, expand_px: int = 0) -> list[Box]:
    """Greedily fuse boxes that overlap, visiting larger boxes first.

    Each box is compared with the clusters built so far, both grown by
    *expand_px* on every side for the comparison only. It joins the first
    cluster whose IoU with it reaches *iou_thresh*, or whose grown area is
    at least 60% covered by it. The cluster becomes the union of the
    un-grown boxes. A box that matches nothing starts a new cluster.
    """

    def grow(box: Box) -> Box:
        return Box(
            box.x1 - expand_px,
            box.y1 - expand_px,
            box.x2 + expand_px,
            box.y2 + expand_px,
        )

    clusters: list[Box] = []
    for candidate in sorted(boxes, key=lambda item: item.area, reverse=True):
        grown_candidate = grow(candidate)
        for slot, current in enumerate(clusters):
            grown_current = grow(current)
            overlaps = (
                grown_current.iou(grown_candidate) >= iou_thresh
                or grown_current.overlap_ratio(grown_candidate) >= 0.6
            )
            if overlaps:
                clusters[slot] = Box(
                    min(current.x1, candidate.x1),
                    min(current.y1, candidate.y1),
                    max(current.x2, candidate.x2),
                    max(current.y2, candidate.y2),
                )
                break
        else:
            clusters.append(candidate)
    return clusters
333
+
334
+
335
def connected_text_candidates(
    image: np.ndarray,
    parent_box: Box,
    min_area_ratio: float,
    max_area_ratio: float,
    max_candidates: int,
) -> list[Box]:
    """Propose text-like sub-regions around *parent_box* with classic OpenCV ops.

    The area around the parent box is thresholded in both polarities,
    cleaned up, dilated along the presumed text direction and split into
    connected components. Components passing the size and density filters
    are merged and returned in page coordinates.

    Args:
        image: Page image; converted with ``COLOR_BGR2GRAY``, so a 3-channel
            BGR array is expected.
        parent_box: Annotated text box to search around.
        min_area_ratio: Smallest allowed component bounding-box area,
            relative to the parent box area.
        max_area_ratio: Largest allowed component bounding-box area,
            relative to the parent box area.
        max_candidates: Maximum number of boxes returned.

    Returns:
        Up to *max_candidates* merged boxes, largest area first. Empty when
        the padded crop has no pixels.
    """
    # Search slightly beyond the annotated box (8% per side, at least 6 px).
    crop_box = parent_box.expand(image.shape, ratio=0.08, min_pad=6)
    crop = image[crop_box.y1 : crop_box.y2, crop_box.x1 : crop_box.x2]
    if crop.size == 0:
        return []

    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    orientation = "vertical" if parent_box.height >= parent_box.width else "horizontal"
    parent_area = max(parent_box.area, 1)

    # Kernel size is (width, height): elongated along the text direction so
    # dilation joins neighbouring glyphs of one line/column.
    if orientation == "vertical":
        primary_kernel = cv2.getStructuringElement(
            cv2.MORPH_RECT,
            (3, max(9, int(round(parent_box.height * 0.12)))),
        )
    else:
        primary_kernel = cv2.getStructuringElement(
            cv2.MORPH_RECT,
            (max(9, int(round(parent_box.width * 0.12))), 3),
        )
    cleanup_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    candidates: list[Box] = []
    # Run on the image and its inverse to cover both dark-on-light and
    # light-on-dark text.
    for source in (gray, 255 - gray):
        binary = cv2.adaptiveThreshold(
            source,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV,
            31,
            11,
        )
        # Open removes specks, close fills small gaps, dilate links glyphs.
        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, cleanup_kernel)
        merged = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, cleanup_kernel)
        merged = cv2.dilate(merged, primary_kernel, iterations=1)

        num_labels, _, stats, _ = cv2.connectedComponentsWithStats(merged, connectivity=8)
        # Label 0 is the background component, so start at 1.
        for label in range(1, num_labels):
            x, y, w, h, area = stats[label].tolist()
            bbox_area = max(1, w * h)
            if bbox_area < parent_area * min_area_ratio or bbox_area > parent_area * max_area_ratio:
                continue
            if w < 6 or h < 6:
                continue
            # Fraction of the bounding box actually filled by the component.
            density = area / bbox_area
            if density < 0.10:
                continue
            # Shift from crop coordinates back to page coordinates.
            box = Box(crop_box.x1 + x, crop_box.y1 + y, crop_box.x1 + x + w, crop_box.y1 + y + h)
            candidates.append(box)

    merged_candidates = merge_overlapping_boxes(candidates, iou_thresh=0.20, expand_px=6)
    merged_candidates.sort(key=lambda box: box.area, reverse=True)
    return merged_candidates[:max_candidates]
394
+
395
+
396
def reading_order(boxes: Sequence[Box], orientation: str) -> list[int]:
    """Return the indices of *boxes* arranged in reading order.

    For ``"vertical"`` text, boxes run right to left by centre x, then top
    to bottom. For any other orientation they run top to bottom, then left
    to right. Ties keep their input order.
    """
    if orientation == "vertical":

        def sort_key(position: int):
            box = boxes[position]
            return (-box.center_x, box.y1)

    else:

        def sort_key(position: int):
            box = boxes[position]
            return (box.y1, box.x1)

    return sorted(range(len(boxes)), key=sort_key)
403
+
404
+
405
def split_transcript(transcript: str) -> list[str]:
    """Break *transcript* into its non-blank lines, each stripped of whitespace."""
    pieces: list[str] = []
    for line in transcript.splitlines():
        trimmed = line.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
407
+
408
+
409
def sanitize_filename(value: str) -> str:
    """Replace every character that is not alphanumeric, ``-``, ``_`` or ``.`` with ``_``.

    "Alphanumeric" follows ``str.isalnum``, so non-ASCII letters and digits
    are kept as they are.
    """
    return "".join(
        char if (char.isalnum() or char in "-_.") else "_"
        for char in value
    )
417
+
418
+
419
def ctd_blocks_for_page(blk_list: Sequence) -> list[dict]:
    """Convert detector block objects into plain dictionaries.

    Each result holds the block's ``xyxy`` rectangle as a ``Box`` under
    ``"bbox"``, its lines as integer ``[x, y]`` point lists under
    ``"line_polygons"`` and its vertical flag as a ``bool`` under ``"vertical"``.
    """

    def describe(block) -> dict:
        corners = [int(block.xyxy[i]) for i in range(4)]
        outlines = [
            [[int(point[0]), int(point[1])] for point in line]
            for line in block.lines
        ]
        return {
            "bbox": Box(*corners),
            "line_polygons": outlines,
            "vertical": bool(block.vertical),
        }

    return [describe(block) for block in blk_list]
435
+
436
+
437
def select_ctd_blocks(
    parent: OriginalText,
    ctd_blocks: Sequence[dict],
    cv2_candidates: Sequence[Box],
) -> list[CTDBlock]:
    """Pick the detector blocks that plausibly belong to one annotated text.

    Args:
        parent: The original annotation whose box is being refined.
        ctd_blocks: Page-level detector blocks, as dictionaries with
            ``"bbox"``, ``"line_polygons"`` and ``"vertical"`` keys.
        cv2_candidates: OpenCV candidate boxes found around the parent box,
            used as supporting evidence.

    Returns:
        Accepted blocks sorted by descending score (ties by larger area),
        with near-duplicates of a higher-ranked block removed.
    """
    # Tolerate detector boxes that overshoot the annotation by a few pixels.
    expanded_parent = Box(
        parent.bbox.x1 - 8,
        parent.bbox.y1 - 8,
        parent.bbox.x2 + 8,
        parent.bbox.y2 + 8,
    )
    chosen: list[CTDBlock] = []
    for block in ctd_blocks:
        bbox: Box = block["bbox"]
        inter_parent = bbox.intersection_area(expanded_parent)
        if inter_parent <= 0:
            continue

        # Share of the block inside the (expanded) parent, and share of the
        # original parent area that the overlap represents.
        in_parent_ratio = inter_parent / max(bbox.area, 1)
        parent_cover_ratio = inter_parent / max(parent.bbox.area, 1)
        center_inside = expanded_parent.contains_center(bbox)
        # Reject blocks that merely graze the parent.
        if not center_inside and in_parent_ratio < 0.30 and parent_cover_ratio < 0.08:
            continue

        best_candidate_cover = 0.0
        best_candidate_iou = 0.0
        for candidate in cv2_candidates:
            inter_candidate = bbox.intersection_area(candidate)
            if inter_candidate <= 0:
                continue
            best_candidate_cover = max(best_candidate_cover, inter_candidate / max(bbox.area, 1))
            best_candidate_iou = max(best_candidate_iou, bbox.iou(candidate))

        line_count = len(block["line_polygons"])
        candidate_support = max(best_candidate_cover, best_candidate_iou)
        # Weighted blend of the evidence; the line bonus is capped at 4 lines.
        score = in_parent_ratio * 0.55 + parent_cover_ratio * 0.15 + candidate_support * 0.20 + min(line_count, 4) * 0.05
        if center_inside:
            score += 0.10

        # Drop tiny or weakly supported blocks that have at most one line.
        is_tiny = bbox.area < max(100, int(parent.bbox.area * 0.03))
        if is_tiny and candidate_support < 0.22 and line_count <= 1:
            continue
        if candidate_support < 0.12 and in_parent_ratio < 0.55 and line_count <= 1:
            continue

        chosen.append(
            CTDBlock(
                bbox=bbox,
                quad=quad_from_line_polygons(block["line_polygons"], bbox),
                line_polygons=block["line_polygons"],
                vertical=bool(block["vertical"]),
                score=score,
                support=candidate_support,
            )
        )

    # Best-scoring blocks first so that de-duplication keeps them.
    chosen.sort(key=lambda item: (item.score, item.bbox.area), reverse=True)
    deduped: list[CTDBlock] = []
    for block in chosen:
        duplicate = False
        for existing in deduped:
            if block.bbox.iou(existing.bbox) >= 0.65:
                duplicate = True
                break
            # Also a duplicate when the smaller box is mostly inside the other.
            inter = block.bbox.intersection_area(existing.bbox)
            smaller = min(block.bbox.area, existing.bbox.area)
            if smaller > 0 and inter / smaller >= 0.80:
                duplicate = True
                break
        if not duplicate:
            deduped.append(block)
    return deduped
509
+
510
+
511
def final_blocks_for_text(
    parent: OriginalText,
    ctd_matches: Sequence[CTDBlock],
) -> tuple[str, list[dict], list[str]]:
    """Decide which boxes and transcriptions to emit for one annotation.

    Returns ``(action, blocks, transcript_segments)`` where *action* is:

    * ``"keep_original"``: no detector match; the original box is kept.
    * ``"refined_single"``: one match; it carries the whole transcript.
    * ``"refined_split"``: as many matches as transcript lines, paired in
      reading order.
    * ``"refined_split_grouped"``: the detector line counts add up to the
      number of transcript lines, so consecutive lines are grouped per match.
    * ``"keep_original_split_mismatch"``: several matches that cannot be
      aligned with the transcript; the original box is kept.
    """

    def original_entry() -> dict:
        return {
            "bbox": parent.bbox,
            "quad": parent.bbox.to_quad(),
            "transcription": parent.transcript,
            "source": "original",
            "orientation": parent.orientation,
        }

    def ctd_entry(block: CTDBlock, text: str, source: str, direction: str) -> dict:
        return {
            "bbox": block.bbox,
            "quad": block.quad,
            "transcription": text,
            "source": source,
            "orientation": direction,
            "score": round(block.score, 4),
            "support": round(block.support, 4),
        }

    if not ctd_matches:
        return "keep_original", [original_entry()], []

    # Majority vote on direction; an exact tie counts as vertical.
    vertical_votes = sum(1 for item in ctd_matches if item.vertical)
    if vertical_votes >= len(ctd_matches) / 2:
        orientation = "vertical"
    else:
        orientation = "horizontal"

    order = reading_order([item.bbox for item in ctd_matches], orientation)
    ordered = [ctd_matches[position] for position in order]
    segments = split_transcript(parent.transcript)

    if len(ordered) == 1:
        only = ordered[0]
        return (
            "refined_single",
            [ctd_entry(only, parent.transcript, "ctd", orientation)],
            segments,
        )

    if segments and len(segments) == len(ordered):
        paired = [
            ctd_entry(block, piece, "ctd_split", orientation)
            for piece, block in zip(segments, ordered)
        ]
        return "refined_split", paired, segments

    line_counts = [max(1, len(block.line_polygons)) for block in ordered]
    if segments and sum(line_counts) == len(segments):
        grouped_entries: list[dict] = []
        grouped_segments: list[str] = []
        start = 0
        for block, take in zip(ordered, line_counts):
            joined = "\n".join(segments[start : start + take])
            start += take
            grouped_segments.append(joined)
            grouped_entries.append(
                ctd_entry(block, joined, "ctd_split_grouped", orientation)
            )
        return "refined_split_grouped", grouped_entries, grouped_segments

    return "keep_original_split_mismatch", [original_entry()], segments
604
+
605
+
606
def write_crop(image: np.ndarray, bbox: Box, output_path: Path) -> None:
    """Save the part of *image* covered by *bbox* to *output_path*.

    The box is clamped to the image bounds first. Nothing is written, and
    no directory is created, when the clamped region has no pixels.

    Args:
        image: Page image indexed as ``[row, column, ...]``.
        bbox: Region to cut out, in pixel coordinates.
        output_path: Destination file; parent directories are created.
    """
    height, width = image.shape[:2]
    # Clamp explicitly: a negative start or stop would be read by NumPy as
    # an offset from the far edge and silently select the wrong region.
    x1 = min(max(bbox.x1, 0), width)
    y1 = min(max(bbox.y1, 0), height)
    x2 = min(max(bbox.x2, 0), width)
    y2 = min(max(bbox.y2, 0), height)
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        return
    output_path.parent.mkdir(parents=True, exist_ok=True)
    cv2.imwrite(str(output_path), crop)
612
+
613
+
614
def page_label_line(image_rel_path: str, entries: list[dict]) -> str:
    """Format one detection label row: image path, a tab, then a JSON list.

    Only the ``"transcription"`` and ``"points"`` keys of each entry are
    kept. Non-ASCII text is written as-is rather than escaped.
    """
    trimmed = []
    for item in entries:
        trimmed.append(
            {"transcription": item["transcription"], "points": item["points"]}
        )
    encoded = json.dumps(trimmed, ensure_ascii=False)
    return f"{image_rel_path}\t{encoded}"
620
+
621
+
622
def _rec_label_text(transcription: str) -> str:
    """Collapse *transcription* onto one line for the tab-separated rec label file."""
    # The label file holds one "<path>\t<text>" record per line, so raw
    # newlines or tabs in the text would corrupt it. Lines are joined with
    # no separator (assumes Japanese text without inter-word spaces;
    # confirm if other scripts are added).
    single_line = "".join(part.strip() for part in transcription.splitlines())
    return single_line.replace("\t", " ")


def main() -> None:
    """Run the full refinement pipeline.

    Steps: parse arguments, split the books into train/val/test, run the
    comic-text-detector on every page, refine each annotated text box, and
    write under the output root:

    * ``images/<split>/<book>/``: hard-linked or copied page images
    * ``det/<split>.txt``: detection labels, one page per line
    * ``rec/<split>/`` and ``rec/rec_gt_<split>.txt``: crops and their labels
    * ``manifests/pages.<split>.jsonl`` and ``manifests/texts.<split>.jsonl``
    * ``stats/summary.json``: per-split and global counters

    Raises:
        FileNotFoundError: if the dataset root or the model file is missing.
    """
    args = parse_args()

    dataset_root = Path(args.dataset_root)
    output_root = Path(args.output_root)
    ctd_root = Path(args.ctd_root)
    model_path = Path(args.model_path)

    if not dataset_root.exists():
        raise FileNotFoundError(f"Manga109 root not found: {dataset_root}")
    if not model_path.exists():
        raise FileNotFoundError(f"comictextdetector.pt not found: {model_path}")

    ensure_clean_dir(output_root, overwrite=args.overwrite)
    for subdir in ("det", "rec", "images", "manifests", "stats"):
        (output_root / subdir).mkdir(parents=True, exist_ok=True)

    books = load_books(dataset_root)
    if args.book_limit > 0:
        books = books[: args.book_limit]
    split_map = split_books(books, seed=args.seed, spec=args.split_spec)

    detector = load_ctd_detector(
        ctd_root=ctd_root,
        model_path=model_path,
        device=args.device,
        input_size=args.ctd_input_size,
        conf_thresh=args.ctd_conf_thresh,
        nms_thresh=args.ctd_nms_thresh,
    )

    summary = {
        "dataset_root": str(dataset_root.resolve()),
        "output_root": str(output_root.resolve()),
        "device": args.device,
        "model_path": str(model_path.resolve()),
        "split_spec": args.split_spec,
        "seed": args.seed,
        "books": {},
        "global": Counter(),
    }

    annotations_root = dataset_root / "annotations"
    images_root = dataset_root / "images"

    for split, split_books_list in split_map.items():
        det_lines: list[str] = []
        rec_lines: list[str] = []
        page_manifest_path = output_root / "manifests" / f"pages.{split}.jsonl"
        text_manifest_path = output_root / "manifests" / f"texts.{split}.jsonl"
        split_counter = Counter()

        with (
            page_manifest_path.open("w", encoding="utf-8") as page_manifest,
            text_manifest_path.open("w", encoding="utf-8") as text_manifest,
        ):
            for book in iter_with_progress(split_books_list, f"{split} books"):
                split_counter["books"] += 1
                xml_path = annotations_root / f"{book}.xml"
                image_dir = images_root / book

                tree = ET.parse(xml_path)
                pages = tree.getroot().findall("./pages/page")
                if args.page_limit > 0:
                    pages = pages[: args.page_limit]

                for page in pages:
                    page_index = int(page.attrib["index"])
                    image_path = image_dir / f"{page_index:03d}{IMAGE_EXT}"
                    image_rel_path = Path("images") / split / book / image_path.name
                    output_image_path = output_root / image_rel_path

                    # Load before linking: a missing or unreadable page is
                    # skipped cleanly and leaves no unlabelled image in the
                    # output tree.
                    image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
                    if image is None:
                        split_counter["unreadable_pages"] += 1
                        print(f"Skipping unreadable page image: {image_path}", file=sys.stderr)
                        continue
                    hardlink_or_copy(image_path, output_image_path)

                    original_texts = parse_original_texts(page)
                    split_counter["pages"] += 1
                    split_counter["original_texts"] += len(original_texts)

                    # Only the third value returned by the detector is used.
                    _, _, blk_list = detector(image)
                    page_ctd_blocks = ctd_blocks_for_page(blk_list)
                    split_counter["ctd_blocks"] += len(page_ctd_blocks)

                    page_det_entries: list[dict] = []
                    page_manifest_record = {
                        "book_title": book,
                        "page_index": page_index,
                        "image_path": image_rel_path.as_posix(),
                        "original_text_count": len(original_texts),
                        "ctd_block_count": len(page_ctd_blocks),
                        "texts": [],
                    }

                    for original in original_texts:
                        cv2_candidates = connected_text_candidates(
                            image=image,
                            parent_box=original.bbox,
                            min_area_ratio=args.cv2_min_area_ratio,
                            max_area_ratio=args.cv2_max_area_ratio,
                            max_candidates=args.cv2_max_candidates,
                        )
                        split_counter["cv2_candidates"] += len(cv2_candidates)

                        ctd_matches = select_ctd_blocks(
                            parent=original,
                            ctd_blocks=page_ctd_blocks,
                            cv2_candidates=cv2_candidates,
                        )
                        action, final_blocks, transcript_segments = final_blocks_for_text(
                            parent=original,
                            ctd_matches=ctd_matches,
                        )
                        split_counter[action] += 1
                        split_counter["final_blocks"] += len(final_blocks)

                        manifest_blocks = []
                        for block_idx, block in enumerate(final_blocks):
                            bbox: Box = block["bbox"]
                            quad = block["quad"]
                            transcription = block["transcription"]
                            manifest_blocks.append(
                                {
                                    "bbox_xyxy": bbox.to_list(),
                                    "quad_clockwise": quad,
                                    "transcription": transcription,
                                    "source": block["source"],
                                    "orientation": block["orientation"],
                                    "score": block.get("score"),
                                    "support": block.get("support"),
                                }
                            )
                            page_det_entries.append(
                                {
                                    "points": quad,
                                    "transcription": transcription,
                                }
                            )

                            crop_name = (
                                f"{sanitize_filename(book)}_{page_index:03d}_"
                                f"{sanitize_filename(original.text_id)}_{block_idx:02d}.png"
                            )
                            crop_rel_path = Path("rec") / split / crop_name
                            crop_output_path = output_root / crop_rel_path
                            write_crop(image, bbox, crop_output_path)
                            # write_crop skips empty regions, so only list
                            # crops that exist on disk.
                            if crop_output_path.exists():
                                rec_lines.append(
                                    f"{crop_rel_path.as_posix()}\t{_rec_label_text(transcription)}"
                                )
                                split_counter["rec_crops"] += 1

                        text_record = {
                            "book_title": book,
                            "page_index": page_index,
                            "image_path": image_rel_path.as_posix(),
                            "text_id": original.text_id,
                            "original_bbox_xyxy": original.bbox.to_list(),
                            "original_quad_clockwise": original.bbox.to_quad(),
                            "original_transcript": original.transcript,
                            "original_orientation": original.orientation,
                            "cv2_candidates": [candidate.to_list() for candidate in cv2_candidates],
                            "ctd_matches": [
                                {
                                    "bbox_xyxy": match.bbox.to_list(),
                                    "quad_clockwise": match.quad,
                                    "vertical": match.vertical,
                                    "score": round(match.score, 4),
                                    "support": round(match.support, 4),
                                    "line_polygons": match.line_polygons,
                                }
                                for match in ctd_matches
                            ],
                            "transcript_segments": transcript_segments,
                            "action": action,
                            "final_blocks": manifest_blocks,
                        }
                        text_manifest.write(json.dumps(text_record, ensure_ascii=False) + "\n")
                        page_manifest_record["texts"].append(
                            {
                                "text_id": original.text_id,
                                "action": action,
                                "original_bbox_xyxy": original.bbox.to_list(),
                                "final_blocks": manifest_blocks,
                            }
                        )

                    det_lines.append(page_label_line(image_rel_path.as_posix(), page_det_entries))
                    page_manifest.write(json.dumps(page_manifest_record, ensure_ascii=False) + "\n")

        (output_root / "det" / f"{split}.txt").write_text("\n".join(det_lines), encoding="utf-8")
        (output_root / "rec" / f"rec_gt_{split}.txt").write_text("\n".join(rec_lines), encoding="utf-8")
        summary["books"][split] = dict(split_counter)
        summary["global"].update(split_counter)

    # Counter is converted to a plain dict before the summary is dumped.
    summary["global"] = dict(summary["global"])
    (output_root / "stats" / "summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(json.dumps(summary, ensure_ascii=False, indent=2))
825
+
826
+
827
+ if __name__ == "__main__":
828
+ main()