donghyun committed
Commit 8672bad · 1 Parent(s): 1a7b7d2

Add OCR code, modules, and weights

.gitignore ADDED
@@ -0,0 +1,22 @@
+ # Python
+ __pycache__/
+ *.pyc
+ *.pyo
+
+ # Environment & Secrets
+ .env
+ *.json
+ weights/*.json
+
+ # Logs & Temp
+ *.log
+ *.tmp
+ *.temp
+
+ # Output files
+ *_bbox.*
+ *_ocr_result.json
+
+ # OS files
+ .DS_Store
+ Thumbs.db
README.md ADDED
@@ -0,0 +1,2 @@
+
+
ai_modules/__init__.py ADDED
@@ -0,0 +1,60 @@
+ # -*- coding: utf-8 -*-
+ """
+ ================================================================================
+ Epitext AI Unified Preprocessing Module
+ ================================================================================
+
+ Unified image preprocessing package (produces the Swin Gray and OCR variants together).
+
+ A single function call completes both preprocessing steps:
+
+ 1. Swin Gray: grayscale, non-binarized (minimal information loss) -> 3-channel JPG
+ 2. OCR: binarized (crisp black and white) -> 1-channel PNG
+
+ Version: 1.0.0
+ Status: ✅ Production Ready
+
+ Key features:
+
+ ✅ Efficiency: region detection runs once (shared by both outputs)
+ ✅ Background guarantee: Swin (light) + OCR (white)
+ ✅ Rubbing support: optional automatic detection
+ ✅ Configurable: JSON-based customization
+ """
+
+ from .preprocessor_unified import (
+     UnifiedImagePreprocessor,
+     get_preprocessor,
+     preprocess_image_unified
+ )
+ from .ocr_engine import (
+     get_ocr_engine,
+     OCREngine,
+     ocr_and_detect
+ )
+ from .nlp_engine import (
+     get_nlp_engine,
+     NLPEngine,
+     process_text_with_nlp
+ )
+
+ __version__ = "1.0.0"
+ __author__ = "Epitext Team"
+
+ __all__ = [
+     "UnifiedImagePreprocessor",
+     "get_preprocessor",
+     "preprocess_image_unified",
+     "get_ocr_engine",
+     "OCREngine",
+     "ocr_and_detect",
+     "get_nlp_engine",
+     "NLPEngine",
+     "process_text_with_nlp"
+ ]
+
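For orientation, a minimal sketch of how the package-level API above is meant to be chained. The exact signatures and return keys of preprocess_image_unified and ocr_and_detect are not visible in this commit, so the input path and the "text"/"results" keys below are assumptions:

from ai_modules import preprocess_image_unified, ocr_and_detect, process_text_with_nlp

# 1) One call yields both preprocessed variants (Swin Gray JPG + binarized OCR PNG).
pre = preprocess_image_unified("sample_page.jpg")   # hypothetical input image

# 2) OCR plus damaged-region detection.
ocr = ocr_and_detect("sample_page.jpg")

# 3) Punctuation restoration and MLM prediction over the recognized text
#    (assumed return keys; adjust to the actual ocr_and_detect output).
nlp = process_text_with_nlp(ocr["text"], ocr_results=ocr.get("results"))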
ai_modules/models/HRCenterNet.py ADDED
@@ -0,0 +1,204 @@
+ import torch
+ from torch import nn
+ from ai_modules.models.modules import BasicBlock, Bottleneck
+
+
+ class StageModule(nn.Module):
+     def __init__(self, stage, output_branches, c, bn_momentum):
+         super(StageModule, self).__init__()
+         self.stage = stage
+         self.output_branches = output_branches
+
+         self.branches = nn.ModuleList()
+         for i in range(self.stage):
+             w = c * (2 ** i)
+             branch = nn.Sequential(
+                 BasicBlock(w, w, bn_momentum=bn_momentum),
+                 BasicBlock(w, w, bn_momentum=bn_momentum),
+                 BasicBlock(w, w, bn_momentum=bn_momentum),
+                 BasicBlock(w, w, bn_momentum=bn_momentum),
+             )
+             self.branches.append(branch)
+
+         self.fuse_layers = nn.ModuleList()
+         # for each output branch (i.e. each branch in all cases but the very last one)
+         for i in range(self.output_branches):
+             self.fuse_layers.append(nn.ModuleList())
+             for j in range(self.stage):  # for each branch
+                 if i == j:
+                     self.fuse_layers[-1].append(nn.Sequential())  # used in place of "None" because it is callable
+                 elif i < j:
+                     self.fuse_layers[-1].append(nn.Sequential(
+                         nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(1, 1), stride=(1, 1), bias=False),
+                         nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
+                         nn.Upsample(scale_factor=(2.0 ** (j - i)), mode='nearest'),
+                     ))
+                 elif i > j:
+                     ops = []
+                     for k in range(i - j - 1):
+                         ops.append(nn.Sequential(
+                             nn.Conv2d(c * (2 ** j), c * (2 ** j), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
+                                       bias=False),
+                             nn.BatchNorm2d(c * (2 ** j), eps=1e-05, momentum=0.1, affine=True,
+                                            track_running_stats=True),
+                             nn.ReLU(inplace=True),
+                         ))
+                     ops.append(nn.Sequential(
+                         nn.Conv2d(c * (2 ** j), c * (2 ** i), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),
+                                   bias=False),
+                         nn.BatchNorm2d(c * (2 ** i), eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
+                     ))
+                     self.fuse_layers[-1].append(nn.Sequential(*ops))
+
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         assert len(self.branches) == len(x)
+
+         x = [branch(b) for branch, b in zip(self.branches, x)]
+
+         x_fused = []
+         for i in range(len(self.fuse_layers)):
+             for j in range(0, len(self.branches)):
+                 if j == 0:
+                     x_fused.append(self.fuse_layers[i][0](x[0]))
+                 else:
+                     x_fused[i] = x_fused[i] + self.fuse_layers[i][j](x[j])
+
+         for i in range(len(x_fused)):
+             x_fused[i] = self.relu(x_fused[i])
+
+         return x_fused
+
+
+ class _HRCenterNet(nn.Module):
+     def __init__(self, c=48, nof_joints=17, bn_momentum=0.1):
+         super(_HRCenterNet, self).__init__()
+
+         # Input (stem net)
+         self.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+         self.bn1 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
+         self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+         self.bn2 = nn.BatchNorm2d(64, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True)
+         self.relu = nn.ReLU(inplace=True)
+
+         # Stage 1 (layer1) - First group of bottleneck (resnet) modules
+         downsample = nn.Sequential(
+             nn.Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False),
+             nn.BatchNorm2d(256, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+         )
+         self.layer1 = nn.Sequential(
+             Bottleneck(64, 64, downsample=downsample),
+             Bottleneck(256, 64),
+             Bottleneck(256, 64),
+             Bottleneck(256, 64),
+         )
+
+         # Fusion layer 1 (transition1) - Creation of the first two branches (one full and one half resolution)
+         self.transition1 = nn.ModuleList([
+             nn.Sequential(
+                 nn.Conv2d(256, c, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
+                 nn.BatchNorm2d(c, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+                 nn.ReLU(inplace=True),
+             ),
+             nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
+                 nn.Conv2d(256, c * (2 ** 1), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
+                 nn.BatchNorm2d(c * (2 ** 1), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+                 nn.ReLU(inplace=True),
+             )),
+         ])
+
+         # Stage 2 (stage2) - Second module with 1 group of bottleneck (resnet) modules. This has 2 branches
+         self.stage2 = nn.Sequential(
+             StageModule(stage=2, output_branches=2, c=c, bn_momentum=bn_momentum),
+         )
+
+         # Fusion layer 2 (transition2) - Creation of the third branch (1/4 resolution)
+         self.transition2 = nn.ModuleList([
+             nn.Sequential(),  # used in place of "None" because it is callable
+             nn.Sequential(),  # used in place of "None" because it is callable
+             nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
+                 nn.Conv2d(c * (2 ** 1), c * (2 ** 2), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
+                 nn.BatchNorm2d(c * (2 ** 2), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+                 nn.ReLU(inplace=True),
+             )),  # ToDo: why does the new branch derive from the "upper" branch only?
+         ])
+
+         # Stage 3 (stage3) - Third module with 4 groups of bottleneck (resnet) modules. This has 3 branches
+         self.stage3 = nn.Sequential(
+             StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
+             StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
+             StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
+             StageModule(stage=3, output_branches=3, c=c, bn_momentum=bn_momentum),
+         )
+
+         # Fusion layer 3 (transition3) - Creation of the fourth branch (1/8 resolution)
+         self.transition3 = nn.ModuleList([
+             nn.Sequential(),  # used in place of "None" because it is callable
+             nn.Sequential(),  # used in place of "None" because it is callable
+             nn.Sequential(),  # used in place of "None" because it is callable
+             nn.Sequential(nn.Sequential(  # Double Sequential to fit with official pretrained weights
+                 nn.Conv2d(c * (2 ** 2), c * (2 ** 3), kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
+                 nn.BatchNorm2d(c * (2 ** 3), eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+                 nn.ReLU(inplace=True),
+             )),  # ToDo: why does the new branch derive from the "upper" branch only?
+         ])
+
+         # Stage 4 (stage4) - Fourth module with 3 groups of bottleneck (resnet) modules. This has 4 branches
+         self.stage4 = nn.Sequential(
+             StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
+             StageModule(stage=4, output_branches=4, c=c, bn_momentum=bn_momentum),
+             StageModule(stage=4, output_branches=1, c=c, bn_momentum=bn_momentum),  # last module keeps only the full-resolution branch
+         )
+
+         # Final layer (final_layer)
+         self.final_layer = nn.Sequential(
+             nn.Conv2d(c, 32, kernel_size=(1, 1), stride=(1, 1)),
+             nn.BatchNorm2d(32, eps=1e-05, momentum=bn_momentum, affine=True, track_running_stats=True),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(32, nof_joints, kernel_size=(1, 1), stride=(1, 1)),
+             nn.Sigmoid()
+         )
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.conv2(x)
+         x = self.bn2(x)
+         x = self.relu(x)
+
+         x = self.layer1(x)
+         x = [trans(x) for trans in self.transition1]  # From here on, x is a list (# == number of branches)
+
+         x = self.stage2(x)
+         # x = [trans(x[-1]) for trans in self.transition2]  # New branch derives from the "upper" branch only
+         x = [
+             self.transition2[0](x[0]),
+             self.transition2[1](x[1]),
+             self.transition2[2](x[-1])
+         ]  # New branch derives from the "upper" branch only
+
+         x = self.stage3(x)
+         # x = [trans(x) for trans in self.transition3]  # New branch derives from the "upper" branch only
+         x = [
+             self.transition3[0](x[0]),
+             self.transition3[1](x[1]),
+             self.transition3[2](x[2]),
+             self.transition3[3](x[-1])
+         ]  # New branch derives from the "upper" branch only
+
+         x = self.stage4(x)
+
+         x = self.final_layer(x[0])
+
+         return x
+
+
+ def HRCenterNet(args):
+
+     model = _HRCenterNet(32, 5, 0.1)
+
+     if args.log_dir is not None:
+         model.load_state_dict(torch.load(args.log_dir))
+
+     return model
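As a sanity check on the architecture above, a short sketch of a raw forward pass, assuming the common 512-pixel input / 128-cell output configuration that the detector in ocr_engine.py reads from its config:

import torch
from ai_modules.models.HRCenterNet import _HRCenterNet

model = _HRCenterNet(32, 5, 0.1)   # c=32 channels, 5 output maps
model.eval()

with torch.no_grad():
    out = model(torch.rand(1, 3, 512, 512))   # the two stride-2 stem convs give a 4x downsample

# The five sigmoid maps are later unpacked as heatmap, offset_y, offset_x, width, height
# (see TextDetector.detect in ocr_engine.py).
print(out.shape)   # torch.Size([1, 5, 128, 128])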
ai_modules/models/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding: utf-8 -*-
+ """
+ OCR model module package
+ """
+
ai_modules/models/modules.py ADDED
@@ -0,0 +1,111 @@
+ import torch
+ from torch import nn
+
+
+ class Bottleneck(nn.Module):
+     expansion = 4
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
+         super(Bottleneck, self).__init__()
+         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+         self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
+         self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = downsample
+         self.stride = stride
+
+     def forward(self, x):
+         residual = x
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+         out = self.relu(out)
+
+         out = self.conv3(out)
+         out = self.bn3(out)
+
+         if self.downsample is not None:
+             residual = self.downsample(x)
+
+         out += residual
+         out = self.relu(out)
+
+         return out
+
+
+ # class Bottleneck_Transpose(nn.Module):
+ #     expansion = 4
+
+ #     def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
+ #         super(Bottleneck, self).__init__()
+ #         nn.ConvTranspose2d(c, 64, (3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1)),
+
+ #         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ #         self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+ #         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ #         self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+ #         self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
+ #         self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=bn_momentum)
+ #         self.relu = nn.ReLU(inplace=True)
+ #         self.downsample = downsample
+ #         self.stride = stride
+
+ #     def forward(self, x):
+ #         residual = x
+
+ #         out = self.conv1(x)
+ #         out = self.bn1(out)
+ #         out = self.relu(out)
+
+ #         out = self.conv2(out)
+ #         out = self.bn2(out)
+ #         out = self.relu(out)
+
+ #         out = self.conv3(out)
+ #         out = self.bn3(out)
+
+ #         if self.downsample is not None:
+ #             residual = self.downsample(x)
+
+ #         out += residual
+ #         out = self.relu(out)
+
+ #         return out
+
+ class BasicBlock(nn.Module):
+     expansion = 1
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, bn_momentum=0.1):
+         super(BasicBlock, self).__init__()
+         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+         self.relu = nn.ReLU(inplace=True)
+         # conv2 consumes conv1's output, so its input channel count is `planes`
+         # (the original passed `inplanes`, which only worked because all callers use inplanes == planes)
+         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(planes, momentum=bn_momentum)
+         self.downsample = downsample
+         self.stride = stride
+
+     def forward(self, x):
+         residual = x
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+
+         if self.downsample is not None:
+             residual = self.downsample(x)
+
+         out += residual
+         out = self.relu(out)
+
+         return out
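A brief shape check for the two blocks, mirroring how HRCenterNet.py uses them (BasicBlock with equal in/out channels inside StageModule, Bottleneck with a 1x1 projection in layer1):

import torch
from torch import nn
from ai_modules.models.modules import BasicBlock, Bottleneck

x = torch.rand(1, 64, 32, 32)

# BasicBlock keeps the channel count (expansion = 1).
assert BasicBlock(64, 64)(x).shape == (1, 64, 32, 32)

# Bottleneck expands planes by 4, so the residual needs a matching 1x1 projection.
down = nn.Sequential(nn.Conv2d(64, 256, kernel_size=1, bias=False), nn.BatchNorm2d(256))
assert Bottleneck(64, 64, downsample=down)(x).shape == (1, 256, 32, 32)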
ai_modules/models/resnet.py ADDED
@@ -0,0 +1,186 @@
+ import torch
+ import PIL
+
+ from torch import nn
+ from torchvision import transforms
+
+ class BasicBlock(nn.Module):
+     expansion = 1
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                  base_width=64, dilation=1, norm_layer=None):
+         super(BasicBlock, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         if groups != 1 or base_width != 64:
+             raise ValueError('BasicBlock only supports groups=1 and base_width=64')
+         if dilation > 1:
+             raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+         # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = norm_layer(planes)
+         self.relu = nn.ReLU(inplace=True)
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = norm_layer(planes)
+         self.downsample = downsample
+         self.stride = stride
+
+     def forward(self, x):
+         identity = x
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out += identity
+         out = self.relu(out)
+
+         return out
+
+ class ResNet(nn.Module):
+     def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
+                  groups=1, width_per_group=64, replace_stride_with_dilation=None,
+                  norm_layer=None):
+         super(ResNet, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         self._norm_layer = norm_layer
+
+         self.inplanes = 64
+         self.dilation = 1
+         if replace_stride_with_dilation is None:
+             # each element in the tuple indicates if we should replace
+             # the 2x2 stride with a dilated convolution instead
+             replace_stride_with_dilation = [False, False, False]
+         if len(replace_stride_with_dilation) != 3:
+             raise ValueError("replace_stride_with_dilation should be None "
+                              "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
+         self.groups = groups
+         self.base_width = width_per_group
+         # Single-channel stem: inputs are grayscale character crops
+         self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
+                                bias=False)
+         self.bn1 = norm_layer(self.inplanes)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+         self.layer1 = self._make_layer(block, 64, layers[0])
+         self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
+                                        dilate=replace_stride_with_dilation[0])
+         self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                        dilate=replace_stride_with_dilation[1])
+         self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                        dilate=replace_stride_with_dilation[2])
+         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                 nn.init.constant_(m.weight, 1)
+                 nn.init.constant_(m.bias, 0)
+
+     def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+         norm_layer = self._norm_layer
+         downsample = None
+         previous_dilation = self.dilation
+         if dilate:
+             self.dilation *= stride
+             stride = 1
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = nn.Sequential(
+                 conv1x1(self.inplanes, planes * block.expansion, stride),
+                 norm_layer(planes * block.expansion),
+             )
+
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
+                             self.base_width, previous_dilation, norm_layer))
+         self.inplanes = planes * block.expansion
+         for _ in range(1, blocks):
+             layers.append(block(self.inplanes, planes, groups=self.groups,
+                                 base_width=self.base_width, dilation=self.dilation,
+                                 norm_layer=norm_layer))
+
+         return nn.Sequential(*layers)
+
+     def _forward_impl(self, x):
+         # See note [TorchScript super()]
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+
+         x = self.avgpool(x)
+         x = torch.flatten(x, 1)
+         x = self.fc(x)
+
+         return x
+
+     def forward(self, x):
+         return self._forward_impl(x)
+
+ def conv1x1(in_planes, out_planes, stride=1):
+     """1x1 convolution"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+     """3x3 convolution with padding"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=dilation, groups=groups, bias=False, dilation=dilation)
+
+ class ResnetCustom(torch.nn.Module):
+     def __init__(self, weight_fn):
+         super().__init__()
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         weight = torch.load(weight_fn, map_location=self.device)
+         self.id2charDict = weight['vocab']['id2char']
+         num_classes = len(self.id2charDict)
+         self.id2charDict[-1] = "■"  # unrecognized token
+         self.transform = transforms.Compose([transforms.Grayscale(),
+                                              transforms.Resize((64, 64)),
+                                              transforms.ToTensor()])
+
+         self.net = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
+         self.net.load_state_dict(weight['model'])
+         self.net = self.net.to(self.device)
+         self.net.eval()
+         # self.net(torch.rand((64, 1, 64, 64)))  # optional warm-up pass
+         print(f'{weight_fn} loaded!')
+
+     def forward(self, images: list, bs=256, conf_thres=0.5):
+         '''
+         input
+             images: list of PIL images
+         return
+             chars: list of recognized chars
+         '''
+         chars = []
+         for i in range(0, len(images), bs):
+             inp = []
+             for image in images[i: i+bs]:
+                 inp.append(self.transform(image))
+             inp = torch.stack(inp, dim=0).to(self.device)
+             out = self.net(inp)
+             out = torch.nn.functional.softmax(out, dim=1)
+             conf, indice = torch.max(out, dim=1)
+             indice[conf < conf_thres] = -1
+             chars += [self.id2charDict[x] for x in indice.tolist()]
+
+         return chars
+
+ if __name__ == "__main__":
+     net = ResnetCustom(weight_fn="best_5000.pt")
+     inp = [PIL.Image.open('0.jpg'), PIL.Image.open('1.png')]
+     print(net(inp))
+
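ResnetCustom assumes a specific checkpoint layout; a sketch of the structure the constructor above reads (the key names come from the loader itself, while the characters and path are illustrative):

import torch

# Structure ResnetCustom expects inside the .pt file:
# weight = {
#     'vocab': {'id2char': {0: '王', 1: '國', ...}},   # class id -> character table
#     'model': <state_dict for ResNet(BasicBlock, [2, 2, 2, 2], num_classes=len(id2char))>,
# }
weight = torch.load("weights/best_5000.pt", map_location="cpu")   # hypothetical path
print(len(weight['vocab']['id2char']), "character classes")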
ai_modules/nlp/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """
+ Korean Historical Text Processor NLP Module
+ Module for punctuation restoration and MLM prediction on Korean classical texts.
+ """
+
+ __version__ = "1.0.0"
+ __author__ = "EPITEXT"
+
+ from .punctuation_restorer import PunctuationRestorer
+ from .mlm_predictor import MLMPredictor
+ from .utils import (
+     remove_punctuation,
+     extract_mask_info,
+     replace_mask_with_symbol,
+     normalize_mask_tokens,
+ )
+
+ __all__ = [
+     "PunctuationRestorer",
+     "MLMPredictor",
+     "remove_punctuation",
+     "extract_mask_info",
+     "replace_mask_with_symbol",
+     "normalize_mask_tokens",
+ ]
+
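The helpers re-exported above are pure functions and can be exercised directly; for example:

from ai_modules.nlp import remove_punctuation, normalize_mask_tokens

text = "王若曰、[MASK1]哉。"
print(remove_punctuation(text))     # 王若曰[MASK1]哉  (punctuation stripped, mask preserved)
print(normalize_mask_tokens(text))  # 王若曰、[MASK]哉。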
ai_modules/nlp/mlm_predictor.py ADDED
@@ -0,0 +1,118 @@
+ """
+ MLM (Masked Language Model) prediction module
+ Predicts masked tokens using a BERT-based MLM.
+ """
+
+ import torch
+ from typing import Any, Dict, List
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ from .utils import normalize_mask_tokens
+
+
+ class MLMPredictor:
+     """Class responsible for MLM prediction"""
+
+     def __init__(self, config: Dict, device: str = "cpu"):
+         """
+         Initializes the MLM predictor.
+
+         Args:
+             config: configuration dictionary (loaded from nlp_config.json)
+             device: compute device ('cpu' or 'cuda')
+         """
+         mlm_cfg = config['mlm_model']
+         self.model_name = mlm_cfg['model_name']
+         self.top_k = mlm_cfg['top_k']
+         self.max_length = mlm_cfg['max_length']
+         self.device = device
+         self.tokenizer = None
+         self.model = None
+
+     def load_model(self) -> None:
+         """Loads the model into memory."""
+         print(f"[MLM] Loading model: {self.model_name}")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.model_name,
+             use_fast=False
+         )
+         self.model = AutoModelForMaskedLM.from_pretrained(self.model_name)
+         self.model.to(self.device)
+         self.model.eval()
+
+         print("[MLM] ✓ MLM model loaded")
+
+     def predict_masks(
+         self,
+         text: str
+     ) -> List[List[Dict[str, Any]]]:
+         """
+         Predicts the [MASK] tokens in a text.
+
+         Args:
+             text: text containing masks
+
+         Returns:
+             A list of top-k predictions for each mask position
+         """
+         # Normalize [MASK1], [MASK2] -> [MASK]
+         text_normalized = normalize_mask_tokens(text)
+
+         print(f"[MLM] Input text sample: {text_normalized[:100]}...")
+         print(f"[MLM] [MASK] token count: {text_normalized.count('[MASK]')}")
+
+         # Tokenize
+         inputs = self.tokenizer(
+             text_normalized,
+             return_tensors="pt",
+             truncation=True,
+             max_length=self.max_length
+         )
+
+         # Move to device
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         # Locate the [MASK] positions
+         mask_indices = torch.where(
+             inputs["input_ids"] == self.tokenizer.mask_token_id
+         )[1]
+
+         print(f"[MLM] [MASK] positions found by the tokenizer: {len(mask_indices)}")
+
+         if len(mask_indices) == 0:
+             print("[MLM] ⚠️ Warning: no [MASK] tokens found!")
+             sample_tokens = self.tokenizer.convert_ids_to_tokens(
+                 inputs['input_ids'][0][:50]
+             )
+             print(f"[MLM] Tokenized input sample: {sample_tokens}")
+             return []
+
+         # Run the prediction
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             logits = outputs.logits
+
+         # Top-k predictions for each mask position
+         all_predictions = []
+         for mask_idx in mask_indices:
+             mask_logits = logits[0, mask_idx, :]
+
+             # Softmax over the full vocabulary, then pick the top-k
+             all_probs = torch.nn.functional.softmax(mask_logits, dim=-1)
+             top_k_probs, top_k_indices = torch.topk(all_probs, self.top_k)
+
+             top_k_tokens = self.tokenizer.convert_ids_to_tokens(
+                 top_k_indices.tolist()
+             )
+
+             predictions = [
+                 {
+                     "token": token,
+                     "probability": float(prob)
+                 }
+                 for token, prob in zip(top_k_tokens, top_k_probs.tolist())
+             ]
+             all_predictions.append(predictions)
+
+         return all_predictions
+
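A minimal driver for the class above. The config dict mirrors the keys read in __init__; the model name is a placeholder, since nlp_config.json is not part of the visible diff:

from ai_modules.nlp.mlm_predictor import MLMPredictor

config = {"mlm_model": {
    "model_name": "bert-base-chinese",  # placeholder; the real tag comes from nlp_config.json
    "top_k": 10,
    "max_length": 512,
}}

predictor = MLMPredictor(config, device="cpu")
predictor.load_model()
preds = predictor.predict_masks("王若曰[MASK1]哉")  # [MASK1] is normalized to [MASK] internally
print(preds[0][:3])  # top-3 candidates for the first mask: [{'token': ..., 'probability': ...}, ...]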
ai_modules/nlp/punctuation_restorer.py ADDED
@@ -0,0 +1,326 @@
+ """
+ Punctuation restoration module
+ Restores punctuation in Korean classical texts using a Hugging Face model.
+ """
+
+ import json
+ import torch
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+ from collections import Counter
+ from huggingface_hub import snapshot_download
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+
+
+ class PunctuationRestorer:
+     """Class responsible for punctuation restoration"""
+
+     def __init__(self, config: Dict, cache_dir: str, device: str = "cpu"):
+         """
+         Initializes the punctuation restorer.
+
+         Args:
+             config: configuration dictionary (loaded from nlp_config.json)
+             cache_dir: model cache directory (base path)
+             device: compute device ('cpu' or 'cuda')
+         """
+         punc_cfg = config['punc_model']
+         self.model_tag = punc_cfg['model_tag']
+         self.max_length = punc_cfg['max_length']
+         self.window_size = punc_cfg['window_size']
+         self.overlap = punc_cfg['overlap']
+
+         self.cache_dir = Path(cache_dir) / "punc"
+         self.device = device
+         self.model_info = None
+
+     def download_model(self) -> None:
+         """Downloads the model from Hugging Face."""
+         self.cache_dir.parent.mkdir(parents=True, exist_ok=True)
+
+         if not self.cache_dir.exists() or not any(self.cache_dir.iterdir()):
+             print(f"[PUNC] Downloading model: {self.model_tag}")
+             snapshot_download(
+                 repo_id=self.model_tag,
+                 repo_type="model",
+                 local_dir=str(self.cache_dir),
+                 local_dir_use_symlinks=False,
+             )
+         else:
+             print(f"[PUNC] Using cached model: {self.cache_dir}")
+
+     def load_model(self) -> None:
+         """Loads the model into memory."""
+         torch_dtype = torch.float16 if "cuda" in self.device else torch.float32
+
+         # Locate the model files
+         fnames = sorted(self.cache_dir.rglob("*.safetensors"))
+         if len(fnames) == 0:
+             # Fall back to another format if there are no safetensors files
+             fnames = sorted(self.cache_dir.rglob("*.bin"))
+
+         if len(fnames) == 0:
+             raise FileNotFoundError(f"Model files not found: {self.cache_dir}")
+
+         hface_path = fnames[0].parent
+
+         # Load the tokenizer and model
+         tokenizer = AutoTokenizer.from_pretrained(
+             str(hface_path),
+             model_max_length=self.max_length
+         )
+         model = AutoModelForTokenClassification.from_pretrained(
+             str(hface_path),
+             device_map=self.device if "cuda" in self.device else None,
+             torch_dtype=torch_dtype
+         )
+         if "cuda" not in self.device:
+             model = model.to(self.device)
+         model.eval()
+
+         # Build the NER pipeline
+         ner_pipeline = pipeline(
+             task="ner",
+             model=model,
+             tokenizer=tokenizer,
+             device=0 if "cuda" in self.device else -1
+         )
+
+         # Load the label mapping
+         label2id_path = hface_path / "label2id.json"
+         if not label2id_path.is_file():
+             label2id_path = hface_path.parent / "label2id.json"
+         if not label2id_path.is_file():
+             raise FileNotFoundError(f"label2id.json not found: {hface_path}")
+
+         label2id = json.loads(label2id_path.read_text(encoding="utf-8"))
+
+         self.model_info = {
+             "model": model,
+             "tokenizer": tokenizer,
+             "pipe": ner_pipeline,
+             "label2id": label2id
+         }
+
+         print("[PUNC] ✓ Punctuation restoration model loaded")
+
+     def restore_punctuation(
+         self,
+         text: str,
+         add_space: bool = True,
+         reduce: bool = True,
+     ) -> str:
+         """
+         Restores punctuation using a sliding-window approach.
+
+         Args:
+             text: input text
+             add_space: whether to add a space after punctuation
+             reduce: whether to simplify punctuation
+
+         Returns:
+             Text with punctuation restored
+         """
+         if not text.strip():
+             return ""
+
+         # Build the label -> punctuation mapping
+         label2punc = self._build_label2punc(add_space, reduce)
+
+         # Predict labels with a sliding window
+         labels = self._predict_labels_sliding(text, self.window_size, self.overlap)
+
+         # Adjust lengths
+         if len(labels) < len(text):
+             labels += ["O"] * (len(text) - len(labels))
+         elif len(labels) > len(text):
+             labels = labels[:len(text)]
+
+         # Insert punctuation
+         result = ""
+         for ch, label in zip(text, labels):
+             result += ch
+             punc = label2punc.get(label, "")
+             result += punc
+
+         return result.strip()
+
+     def _predict_labels_sliding(
+         self,
+         text: str,
+         window_size: int,
+         overlap: int
+     ) -> List[str]:
+         """
+         Predicts a label for each character using a sliding window.
+
+         Args:
+             text: input text
+             window_size: window size
+             overlap: overlap size
+
+         Returns:
+             A list of labels, one per character
+         """
+         n = len(text)
+         if n == 0:
+             return []
+
+         # Candidate labels collected for each position
+         labels_per_pos = [[] for _ in range(n)]
+         stride = max(1, window_size - overlap)
+         start = 0
+
+         while start < n:
+             end = min(start + window_size, n)
+             sub_text = text[start:end]
+
+             try:
+                 # Run the NER prediction
+                 sub_preds = self.model_info["pipe"](sub_text)
+                 _, sub_labels = self._align_predictions(sub_text, sub_preds)
+             except Exception as e:
+                 # On failure, label everything 'O'
+                 print(f"[PUNC] Prediction error (start={start}): {e}")
+                 sub_labels = ["O"] * len(sub_text)
+
+             # Store the labels at their global positions
+             for i, label in enumerate(sub_labels):
+                 gidx = start + i
+                 if gidx >= n:
+                     break
+                 if label != "O":
+                     labels_per_pos[gidx].append(label)
+
+             if end == n:
+                 break
+             start += stride
+
+         # Decide the final label by majority vote
+         final_labels = []
+         for cand_list in labels_per_pos:
+             if not cand_list:
+                 final_labels.append("O")
+             else:
+                 c = Counter(cand_list)
+                 label, _ = c.most_common(1)[0]
+                 final_labels.append(label)
+
+         return final_labels
+
+     @staticmethod
+     def _align_predictions(text: str, predictions: List[dict]) -> Tuple[List[str], List[str]]:
+         """
+         Aligns NER predictions to per-character labels.
+
+         Args:
+             text: original text
+             predictions: NER prediction results
+
+         Returns:
+             A (character list, label list) tuple
+         """
+         words = list(text)
+         labels = ["O" for _ in range(len(words))]
+
+         for pred in predictions:
+             idx = pred["end"] - 1
+             if 0 <= idx < len(labels):
+                 labels[idx] = pred["entity"]
+
+         return words, labels
+
+     def _build_label2punc(self, add_space: bool, reduce: bool) -> Dict[str, str]:
+         """
+         Builds the dictionary that maps labels to punctuation marks.
+
+         Args:
+             add_space: whether to add a space after punctuation
+             reduce: whether to simplify punctuation
+
+         Returns:
+             A label -> punctuation mapping dictionary
+         """
+         label2id = self.model_info["label2id"]
+         label2punc = {f"B-{v}": k for k, v in label2id.items()}
+         label2punc["O"] = ""
+
+         # Simplify the punctuation
+         if reduce:
+             new_label2punc = {}
+             for label, punc in label2punc.items():
+                 if label == "O":
+                     new_label2punc[label] = ""
+                 else:
+                     reduced = self._reduce_punc(punc)
+                     new_label2punc[label] = reduced
+             label2punc = new_label2punc
+
+         # Add spaces
+         if add_space:
+             special_puncs = "!,:;?。"
+             label2punc = {
+                 k: self._insert_space(v, special_puncs)
+                 for k, v in label2punc.items()
+             }
+             label2punc["O"] = ""
+
+         return label2punc
+
+     @staticmethod
+     def _reduce_punc(text: str) -> str:
+         """
+         Simplifies punctuation (converts it to one of ?, 。, or ,).
+
+         Args:
+             text: punctuation string
+
+         Returns:
+             The simplified punctuation
+         """
+         reduce_map = {
+             ",": ",", "-": ",", "/": ",", ":": ",", "|": ",",
+             "·": ",", "、": ",",
+             "?": "?", "!": "。", ".": "。", ";": "。", "。": "。",
+         }
+
+         text = "".join([reduce_map.get(c, "") for c in text])
+         punc_order = "?。,,"
+
+         if len(set(text).intersection(punc_order)) == 0:
+             return ""
+
+         # Pick the most frequent punctuation mark
+         counts = {c: text.count(c) for c in punc_order}
+         max_count = max(counts.values())
+         max_keys = {k for k, v in counts.items() if v == max_count}
+
+         if len(max_keys) == 1:
+             return max_keys.pop()
+
+         # On a tie, pick by priority order
+         for c in punc_order:
+             if c in max_keys:
+                 return c
+
+         return ""
+
+     @staticmethod
+     def _insert_space(text: str, chars: str) -> str:
+         """
+         Inserts a space after the given characters.
+
+         Args:
+             text: original text
+             chars: characters after which to insert a space
+
+         Returns:
+             Text with spaces inserted
+         """
+         result = ""
+         for c in text:
+             result += c
+             if c in chars:
+                 result += " "
+         return result
+
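The window/overlap interaction above is worth pinning down: the stride is window_size - overlap, so a character can collect up to roughly window_size / stride votes, and _predict_labels_sliding resolves disagreements by majority. A small illustration of the voting step (the labels are made up):

from collections import Counter

window_size, overlap = 10, 4
stride = max(1, window_size - overlap)   # 6, as computed in _predict_labels_sliding

# Suppose two overlapping windows disagree about one position:
votes = ["B-。", "B-。", "B-,"]
label, _ = Counter(votes).most_common(1)[0]
print(label)   # B-。  -- the majority wins; positions with no votes fall back to "O"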
ai_modules/nlp/utils.py ADDED
@@ -0,0 +1,87 @@
+ """
+ Utility function module
+ Provides shared helpers such as file I/O and text preprocessing.
+ """
+
+ import re
+ import unicodedata
+ from typing import Dict, Any
+
+
+ def remove_punctuation(text: str) -> str:
+     """
+     Removes punctuation and whitespace from the text, preserving [MASK] tokens.
+
+     Args:
+         text: original text
+
+     Returns:
+         Text with punctuation removed
+     """
+     result = []
+     i = 0
+
+     while i < len(text):
+         # Preserve [MASK...]-style tokens
+         if text[i:i+1] == '[' and 'MASK' in text[i:i+10]:
+             end = text.find(']', i)
+             if end != -1:
+                 result.append(text[i:end+1])
+                 i = end + 1
+                 continue
+
+         # Handle ordinary characters (skip punctuation and whitespace)
+         if unicodedata.category(text[i])[0] not in "PZ":
+             result.append(text[i])
+         i += 1
+
+     return "".join(result)
+
+
+ def replace_mask_with_symbol(text: str, symbol: str = "□") -> str:
+     """
+     Replaces mask tokens such as [MASK1], [MASK2] with the given symbol.
+
+     Args:
+         text: original text
+         symbol: replacement symbol
+
+     Returns:
+         Text with the masks replaced
+     """
+     return re.sub(r'\[MASK\d+\]', symbol, text)
+
+
+ def normalize_mask_tokens(text: str) -> str:
+     """
+     Normalizes [MASK1], [MASK2], etc. to [MASK].
+
+     Args:
+         text: original text
+
+     Returns:
+         The normalized text
+     """
+     return re.sub(r'\[MASK\d+\]', '[MASK]', text)
+
+
+ def extract_mask_info(json_data: Dict[str, Any]) -> list:
+     """
+     Extracts mask information from JSON data.
+
+     Args:
+         json_data: input JSON data
+
+     Returns:
+         A list of mask info entries (including order and type)
+     """
+     mask_info = []
+     for item in json_data.get('results', []):
+         if 'MASK' in item.get('type', ''):
+             mask_info.append({
+                 'order': item['order'],
+                 'type': item['type']
+             })
+     mask_info.sort(key=lambda x: x['order'])
+     return mask_info
+
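extract_mask_info assumes the OCR JSON shape used elsewhere in this commit (a results list whose items carry order and type); a quick illustration:

from ai_modules.nlp.utils import extract_mask_info

ocr_json = {"results": [
    {"order": 2, "type": "MASK2", "text": "[MASK2]"},
    {"order": 0, "type": "HANJA", "text": "王"},
    {"order": 1, "type": "MASK1", "text": "[MASK1]"},
]}
print(extract_mask_info(ocr_json))
# [{'order': 1, 'type': 'MASK1'}, {'order': 2, 'type': 'MASK2'}]  -- MASK entries only, sorted by order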
ai_modules/nlp_engine.py ADDED
@@ -0,0 +1,321 @@
+ """
+ Unified NLP engine
+ Engine that manages punctuation restoration and MLM prediction together.
+ """
+
+ import os
+ import json
+ import torch
+ import logging
+ from pathlib import Path
+ from typing import Dict, Any, Optional, List
+
+ from .nlp.punctuation_restorer import PunctuationRestorer
+ from .nlp.mlm_predictor import MLMPredictor
+ from .nlp.utils import remove_punctuation, replace_mask_with_symbol
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_nlp_config(config_path: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Loads the NLP configuration file.
+
+     Args:
+         config_path: config file path (uses the default path when None)
+
+     Returns:
+         Configuration dictionary
+     """
+     if config_path is None:
+         config_path = Path(__file__).parent / "config" / "nlp_config.json"
+     else:
+         config_path = Path(config_path)
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"NLP config file not found: {config_path}")
+
+     with open(config_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+
+ class NLPEngine:
+     """Unified NLP processing engine"""
+
+     def __init__(self, config_path: Optional[str] = None):
+         """
+         Initializes the NLP engine.
+
+         Args:
+             config_path: config file path (uses the default path when None)
+         """
+         self.config = load_nlp_config(config_path)
+
+         # Device setup
+         dev_cfg = self.config.get('device', 'auto')
+         if dev_cfg == 'auto':
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         else:
+             self.device = dev_cfg
+
+         logger.info(f"[NLP] Device: {self.device}")
+
+         # Model cache path (environment variable or default)
+         self.base_model_dir = os.getenv(
+             'AI_MODEL_DIR',
+             str(Path(__file__).parent.parent / "models")
+         )
+
+         # Sub-module initialization (lazy loading)
+         self.punc_restorer = None
+         self.mlm_predictor = None
+
+     def _load_models(self):
+         """Loads the models into memory on first use"""
+         if self.punc_restorer is None:
+             logger.info("[NLP] Loading punctuation restoration model...")
+             self.punc_restorer = PunctuationRestorer(
+                 self.config,
+                 self.base_model_dir,
+                 self.device
+             )
+             self.punc_restorer.download_model()
+             self.punc_restorer.load_model()
+
+         if self.mlm_predictor is None:
+             logger.info("[NLP] Loading MLM model...")
+             self.mlm_predictor = MLMPredictor(self.config, self.device)
+             self.mlm_predictor.load_model()
+
+     def process_text(
+         self,
+         raw_text: str,
+         ocr_results: Optional[List[Dict]] = None,
+         add_space: bool = True,
+         reduce_punc: bool = True
+     ) -> Dict[str, Any]:
+         """
+         Text processing pipeline:
+         1. Punctuation removal (preprocessing)
+         2. Punctuation restoration
+         3. [MASK] prediction
+
+         Args:
+             raw_text: raw text (may already contain punctuation)
+             ocr_results: optional OCR result list carrying order/type metadata
+             add_space: whether to add a space after punctuation
+             reduce_punc: whether to simplify punctuation
+
+         Returns:
+             Result dictionary
+         """
+         self._load_models()
+
+         try:
+             # 1. Preprocess (remove punctuation, keep [MASK])
+             clean_text = remove_punctuation(raw_text)
+             logger.info(f"[NLP] Punctuation removed: {len(clean_text)} chars")
+
+             # 2. Restore punctuation
+             punctuated_text = self.punc_restorer.restore_punctuation(
+                 clean_text,
+                 add_space=add_space,
+                 reduce=reduce_punc
+             )
+             logger.info(f"[NLP] Punctuation restored: {len(punctuated_text)} chars")
+
+             # 3. MLM prediction
+             mask_predictions = self.mlm_predictor.predict_masks(punctuated_text)
+             logger.info(f"[NLP] MLM prediction done: {len(mask_predictions)} masks")
+
+             # 4. Build the output text ([MASK] -> □)
+             mask_replacement = self.config['tokens']['mask_replacement']
+             final_text = replace_mask_with_symbol(
+                 punctuated_text,
+                 mask_replacement
+             )
+
+             # Extract mask info from OCR results or the original text
+             mask_info_list = []
+             if ocr_results:
+                 # Use OCR results to get order and type
+                 for item in ocr_results:
+                     if 'MASK' in item.get('type', ''):
+                         mask_info_list.append({
+                             'order': item.get('order', 0),
+                             'type': item.get('type', 'MASK2'),
+                             'text': item.get('text', '')
+                         })
+             else:
+                 # Fallback: extract from the text
+                 i = 0
+                 while i < len(raw_text):
+                     if raw_text[i] == '[' and 'MASK' in raw_text[i:i+10]:
+                         end = raw_text.find(']', i)
+                         if end != -1:
+                             mask_text = raw_text[i:end+1]
+                             mask_type = 'MASK1' if 'MASK1' in mask_text else 'MASK2'
+                             mask_info_list.append({
+                                 'order': len(mask_info_list),  # Sequential order
+                                 'type': mask_type,
+                                 'text': mask_text
+                             })
+                             i = end + 1
+                             continue
+                     i += 1
+
+             # Format results according to the specification
+             formatted_results = []
+             for idx, pred_list in enumerate(mask_predictions):
+                 if idx < len(mask_info_list):
+                     mask_info = mask_info_list[idx]
+                     formatted_results.append({
+                         "order": mask_info['order'],
+                         "type": mask_info['type'],
+                         "top_10": pred_list[:10]  # Top-10 predictions
+                     })
+                 else:
+                     # Fallback if mask_info_list is shorter
+                     formatted_results.append({
+                         "order": idx,
+                         "type": "MASK2",
+                         "top_10": pred_list[:10]
+                     })
+
+             # Calculate statistics
+             top1_probs = [preds[0]['probability'] for preds in mask_predictions if preds]
+             statistics = {
+                 "top1_probability_avg": float(sum(top1_probs) / len(top1_probs)) if top1_probs else 0.0,
+                 "top1_probability_min": float(min(top1_probs)) if top1_probs else 0.0,
+                 "top1_probability_max": float(max(top1_probs)) if top1_probs else 0.0,
+                 "total_masks": len(mask_predictions)
+             }
+
+             return {
+                 "punctuated_text_with_masks": final_text,
+                 "results": formatted_results,
+                 "statistics": statistics
+             }
+
+         except Exception as e:
+             logger.error(f"[NLP] Error during processing: {e}", exc_info=True)
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     def restore_punctuation_only(
+         self,
+         text: str,
+         add_space: bool = True,
+         reduce_punc: bool = True
+     ) -> Dict[str, Any]:
+         """
+         Performs punctuation restoration only (no MLM prediction).
+
+         Args:
+             text: input text
+             add_space: whether to add a space after punctuation
+             reduce_punc: whether to simplify punctuation
+
+         Returns:
+             Punctuation restoration result
+         """
+         self._load_models()
+
+         try:
+             clean_text = remove_punctuation(text)
+             punctuated_text = self.punc_restorer.restore_punctuation(
+                 clean_text,
+                 add_space=add_space,
+                 reduce=reduce_punc
+             )
+
+             return {
+                 "success": True,
+                 "original_text": text,
+                 "clean_text": clean_text,
+                 "punctuated_text": punctuated_text
+             }
+         except Exception as e:
+             logger.error(f"[NLP] Error during punctuation restoration: {e}", exc_info=True)
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     def predict_masks_only(
+         self,
+         text: str
+     ) -> Dict[str, Any]:
+         """
+         Performs MLM prediction only (no punctuation restoration).
+
+         Args:
+             text: text containing masks
+
+         Returns:
+             MLM prediction result
+         """
+         self._load_models()
+
+         try:
+             mask_predictions = self.mlm_predictor.predict_masks(text)
+
+             return {
+                 "success": True,
+                 "predictions": mask_predictions,
+                 "mask_count": len(mask_predictions)
+             }
+         except Exception as e:
+             logger.error(f"[NLP] Error during MLM prediction: {e}", exc_info=True)
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+
+ # ================================================================================
+ # Global Accessor
+ # ================================================================================
+ _nlp_engine = None
+
+
+ def get_nlp_engine(config_path: Optional[str] = None) -> NLPEngine:
+     """
+     Returns the global NLP engine instance (singleton pattern).
+
+     Args:
+         config_path: config file path (uses the default path when None)
+
+     Returns:
+         The NLPEngine instance
+     """
+     global _nlp_engine
+     if _nlp_engine is None:
+         _nlp_engine = NLPEngine(config_path)
+     return _nlp_engine
+
+
+ def process_text_with_nlp(
+     text: str,
+     ocr_results: Optional[List[Dict]] = None,
+     config_path: Optional[str] = None,
+     add_space: bool = True,
+     reduce_punc: bool = True
+ ) -> Dict[str, Any]:
+     """
+     Convenience function: processes text through the NLP pipeline.
+
+     Args:
+         text: input text
+         ocr_results: OCR result list (with order and type info)
+         config_path: config file path
+         add_space: whether to add a space after punctuation
+         reduce_punc: whether to simplify punctuation
+
+     Returns:
+         Result dictionary
+     """
+     engine = get_nlp_engine(config_path)
+     return engine.process_text(text, ocr_results=ocr_results, add_space=add_space, reduce_punc=reduce_punc)
+
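For reference, the shape process_text returns on success (field names from the code above, values illustrative). Note that the error path returns only success/error keys, so callers should check for "results" before indexing:

from ai_modules import process_text_with_nlp

out = process_text_with_nlp("王若曰、[MASK1]哉。")
# {
#   "punctuated_text_with_masks": "王若曰。□哉。",   # illustrative
#   "results": [{"order": 0, "type": "MASK1",
#                "top_10": [{"token": "...", "probability": 0.42}, ...]}],
#   "statistics": {"top1_probability_avg": ..., "top1_probability_min": ...,
#                  "top1_probability_max": ..., "total_masks": 1}
# }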
ai_modules/ocr_engine.py ADDED
@@ -0,0 +1,767 @@
+ # -*- coding: utf-8 -*-
+ """
+ ================================================================================
+ OCR Ensemble Module for Epitext AI Project
+ ================================================================================
+ Module: ocr_engine.py (v12.0.0 - Production Ready)
+ Date: 2025-12-03
+ Purpose: Hanja OCR and damaged-region detection via a Google Vision API + HRCenterNet ensemble
+ Status: Production Ready
+ ================================================================================
+ """
+ import os
+ import sys
+ import io
+ import cv2
+ import json
+ import numpy as np
+ import torch
+ import torchvision
+ import re
+ import logging
+ from torch.autograd import Variable
+ from pathlib import Path
+ from PIL import Image
+ from typing import Dict, List, Optional, Tuple, Any
+
+ # ================================================================================
+ # Logging Configuration
+ # ================================================================================
+ logger = logging.getLogger(__name__)
+
+ # ================================================================================
+ # External Model Imports
+ # ================================================================================
+ try:
+     from ai_modules.models.resnet import ResnetCustom
+     from ai_modules.models.HRCenterNet import _HRCenterNet
+     logger.info("[INIT] External model imports complete: ResnetCustom, HRCenterNet")
+ except ImportError as e:
+     logger.error(f"[INIT] Model import failed: {e}")
+     raise
+
+ # ================================================================================
+ # Google Vision API Import
+ # ================================================================================
+ try:
+     from google.cloud import vision
+     HAS_GOOGLE_VISION = True
+ except ImportError:
+     HAS_GOOGLE_VISION = False
+     logger.warning("[INIT] The google-cloud-vision package is not installed.")
+
+ # ================================================================================
+ # Utility Functions
+ # ================================================================================
+ def is_hanja(text: str) -> bool:
+     if not text: return False
+     return re.match(r'[\u4e00-\u9fff]', text) is not None
+
+ def calculate_pixel_density(binary_img: np.ndarray, box: Dict) -> float:
+     x1, y1 = int(box['min_x']), int(box['min_y'])
+     x2, y2 = int(box['max_x']), int(box['max_y'])
+     h, w = binary_img.shape
+     x1, y1 = max(0, x1), max(0, y1)
+     x2, y2 = min(w, x2), min(h, y2)
+     if x2 <= x1 or y2 <= y1: return 0.0
+     roi = binary_img[y1:y2, x1:x2]
+     return cv2.countNonZero(roi) / ((x2 - x1) * (y2 - y1))
+
+ def load_ocr_config(config_path: Optional[str] = None) -> Dict:
+     """Loads the configuration file."""
+     if config_path is None:
+         config_path = str(Path(__file__).parent / "config" / "ocr_config.json")
+
+     with open(config_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
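A quick check of the two helpers above, assuming they are run with the module's definitions in scope; the box dict uses the min_x/min_y/max_x/max_y keys that the Google-symbol path produces later in this file:

import numpy as np

binary = np.zeros((100, 100), dtype=np.uint8)
binary[10:20, 10:20] = 255   # one fully inked 10x10 patch

print(is_hanja("漢"))   # True  - first char falls in the CJK Unified Ideographs range
print(is_hanja("abc"))  # False
print(calculate_pixel_density(binary, {"min_x": 10, "min_y": 10, "max_x": 20, "max_y": 20}))  # 1.0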
78
+ # ================================================================================
79
+ # Text Detection Class
80
+ # ================================================================================
81
+ class TextDetector:
82
+ def __init__(self, device: torch.device, det_ckpt: str, config: Dict):
83
+ self.device = device
84
+ self.config = config
85
+ self.input_size = config['model_config']['input_size']
86
+ self.output_size = config['model_config']['output_size']
87
+
88
+ self.model = _HRCenterNet(32, 5, 0.1)
89
+ if not os.path.exists(det_ckpt):
90
+ raise FileNotFoundError(f"์ฒดํฌํฌ์ธํŠธ ํŒŒ์ผ ์—†์Œ: {det_ckpt}")
91
+
92
+ state = torch.load(det_ckpt, map_location=self.device)
93
+ self.model.load_state_dict(state)
94
+ self.model = self.model.to(self.device)
95
+ self.model.eval()
96
+
97
+ self.transform = torchvision.transforms.Compose([
98
+ torchvision.transforms.Resize((self.input_size, self.input_size)),
99
+ torchvision.transforms.ToTensor()
100
+ ])
101
+
102
+ @torch.no_grad()
103
+ def detect(self, image) -> Tuple[List, List]:
104
+ if isinstance(image, str): img = Image.open(image).convert("RGB")
105
+ elif isinstance(image, np.ndarray): img = Image.fromarray(image).convert("RGB")
106
+ else: img = image.convert("RGB")
107
+
108
+ image_tensor = self.transform(img).unsqueeze_(0)
109
+ inp = Variable(image_tensor).to(self.device, dtype=torch.float)
110
+
111
+ predict = self.model(inp)
112
+ predict_np = predict.data.cpu().numpy()
113
+ heatmap, offset_y, offset_x, width_map, height_map = predict_np[0]
114
+
115
+ bbox, score_list = [], []
116
+ Hc, Wc = img.size[1] / self.output_size, img.size[0] / self.output_size
117
+
118
+ # Config์—์„œ NMS ์ž„๊ณ„๊ฐ’ ๋กœ๋“œ
119
+ nms_cfg = self.config.get('nms_config', {})
120
+ nms_score = nms_cfg.get('primary_threshold', 0.12)
121
+
122
+ idxs = np.where(heatmap.reshape(-1, 1) >= nms_score)[0]
123
+ if len(idxs) == 0:
124
+ nms_score = nms_cfg.get('fallback_threshold', 0.08)
125
+ idxs = np.where(heatmap.reshape(-1, 1) >= nms_score)[0]
126
+
127
+ for j in idxs:
128
+ row = j // self.output_size
129
+ col = j - row * self.output_size
130
+ bias_x = offset_x[row, col] * Hc
131
+ bias_y = offset_y[row, col] * Wc
132
+ width = width_map[row, col] * self.output_size * Hc
133
+ height = height_map[row, col] * self.output_size * Wc
134
+
135
+ score_list.append(float(heatmap[row, col]))
136
+ row = row * Hc + bias_y
137
+ col = col * Wc + bias_x
138
+
139
+ top = row - width / 2.0
140
+ left = col - height / 2.0
141
+ bottom = row + width / 2.0
142
+ right = col + height / 2.0
143
+ bbox.append([left, top, max(0.0, right - left), max(0.0, bottom - top)])
144
+
145
+ if not bbox: return [], []
146
+
147
+ xyxy = [[x, y, x+w, y+h] for x, y, w, h in bbox]
148
+ keep = torchvision.ops.nms(
149
+ torch.tensor(xyxy, dtype=torch.float32),
150
+ scores=torch.tensor(score_list, dtype=torch.float32),
151
+ iou_threshold=nms_cfg.get('iou_threshold', 0.05)
152
+ ).cpu().numpy().tolist()
153
+
154
+ res_boxes, res_scores = [], []
155
+ W, H = img.size
156
+ for k in keep:
157
+ idx = int(k)
158
+ x, y, w, h = bbox[idx]
159
+ x = max(0.0, min(x, W - 1.0))
160
+ y = max(0.0, min(y, H - 1.0))
161
+ w = max(0.0, min(w, W - x))
162
+ h = max(0.0, min(h, H - y))
163
+ if w > 1 and h > 1:
164
+ res_boxes.append([x, y, w, h])
165
+ res_scores.append(score_list[idx])
166
+
167
+ return res_boxes, res_scores
168
+
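+ # Minimal usage sketch (illustrative; assumes the checkpoint and config paths exist):
+ #   detector = TextDetector(torch.device("cpu"), "weights/best.pth", load_ocr_config())
+ #   boxes, scores = detector.detect("page.jpg")  # boxes are [x, y, w, h] in image pixels
+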
169
+ # ================================================================================
170
+ # Merging Logics (config-driven)
171
+ # ================================================================================
172
+ def merge_vertical_fragments(boxes, scores, config):
173
+ if not boxes: return [], []
174
+ rects = [{'x': b[0], 'y': b[1], 'w': b[2], 'h': b[3],
175
+ 'x2': b[0]+b[2], 'y2': b[1]+b[3],
176
+ 'cx': b[0]+b[2]/2, 'cy': b[1]+b[3]/2, 'score': s}
177
+ for b, s in zip(boxes, scores)]
178
+
179
+ cfg = config['merge_config']['vertical_fragments']
180
+
181
+ while True:
182
+ rects.sort(key=lambda r: r['y'])
183
+ merged = False
184
+ new_rects, skip_indices = [], set()
185
+
186
+ for i in range(len(rects)):
187
+ if i in skip_indices: continue
188
+ current = rects[i]
189
+ best_cand_idx = -1
190
+
191
+ for j in range(i + 1, min(i + 5, len(rects))):
192
+ if j in skip_indices: continue
193
+ candidate = rects[j]
194
+
195
+ avg_w = (current['w'] + candidate['w']) / 2
196
+ if abs(current['cx'] - candidate['cx']) > avg_w * cfg['horizontal_center_ratio']: continue
197
+ if (candidate['y'] - current['y2']) > avg_w * cfg['vertical_gap_ratio']: continue
198
+
199
+ new_h = max(current['y2'], candidate['y2']) - min(current['y'], candidate['y'])
200
+ new_w = max(current['x2'], candidate['x2']) - min(current['x'], candidate['x'])
201
+
202
+ is_safe_ratio = (new_h / new_w) < cfg['aspect_ratio_limit']
203
+ cur_square = (current['h'] / current['w']) > 0.85
204
+ cand_square = (candidate['h'] / candidate['w']) > 0.85
205
+ is_overlapped = (candidate['y'] - current['y2']) < -avg_w * 0.2
206
+
207
+ if is_safe_ratio and (not (cur_square and cand_square) or is_overlapped):
208
+ best_cand_idx = j
209
+ break
210
+
211
+ if best_cand_idx != -1:
212
+ cand = rects[best_cand_idx]
213
+ nx, ny = min(current['x'], cand['x']), min(current['y'], cand['y'])
214
+ nx2, ny2 = max(current['x2'], cand['x2']), max(current['y2'], cand['y2'])
215
+ new_rects.append({
216
+ 'x': nx, 'y': ny, 'w': nx2-nx, 'h': ny2-ny,
217
+ 'x2': nx2, 'y2': ny2, 'cx': (nx+nx2)/2, 'cy': (ny+ny2)/2,
218
+ 'score': max(current['score'], cand['score'])
219
+ })
220
+ skip_indices.add(best_cand_idx)
221
+ merged = True
222
+ else:
223
+ new_rects.append(current)
224
+ rects = new_rects
225
+ if not merged: break
226
+
227
+ return [[r['x'], r['y'], r['w'], r['h']] for r in rects], [r['score'] for r in rects]
228
+
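+ # Termination note (added): each pass re-sorts by y and greedily merges each box with at most
+ # one neighbor; every merge strictly reduces the box count, so the while-loop eventually makes
+ # a pass with no merges and exits. merge_google_symbols below follows the same pattern.
+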
229
+ def merge_google_symbols(symbols, config):
230
+ if not symbols: return []
231
+ cfg = config['merge_config']['google_symbols']
232
+
233
+ while True:
234
+ symbols.sort(key=lambda s: s['min_y'])
235
+ merged = False
236
+ new_symbols, skip_indices = [], set()
237
+
238
+ for i in range(len(symbols)):
239
+ if i in skip_indices: continue
240
+ curr = symbols[i]
241
+ best_cand_idx = -1
242
+
243
+ for j in range(i + 1, min(i + 5, len(symbols))):
244
+ if j in skip_indices: continue
245
+ cand = symbols[j]
246
+
247
+ avg_w = (curr['width'] + cand['width']) / 2
248
+ if abs(curr['center_x'] - cand['center_x']) > avg_w * cfg['horizontal_center_ratio']: continue
249
+
250
+ gap = cand['min_y'] - curr['max_y']
251
+ is_touching = gap < (avg_w * cfg['vertical_gap_ratio'])
252
+
253
+ new_h = max(curr['max_y'], cand['max_y']) - min(curr['min_y'], cand['min_y'])
254
+ new_w = max(curr['max_x'], cand['max_x']) - min(curr['min_x'], cand['min_x'])
255
+
256
+ is_both_square = (curr['height']/curr['width'] > 0.85) and (cand['height']/cand['width'] > 0.85)
257
+ is_safe_ratio = (new_h / new_w) < cfg['aspect_ratio_limit']
258
+ is_duplicate = (curr['text'] == cand['text'])
259
+
260
+ if (is_touching and is_safe_ratio and not is_both_square) or is_duplicate:
261
+ best_cand_idx = j
262
+ break
263
+
264
+ if best_cand_idx != -1:
265
+ cand = symbols[best_cand_idx]
266
+ merged_sym = {
267
+ 'text': curr['text'],
268
+ 'min_x': min(curr['min_x'], cand['min_x']), 'min_y': min(curr['min_y'], cand['min_y']),
269
+ 'max_x': max(curr['max_x'], cand['max_x']), 'max_y': max(curr['max_y'], cand['max_y']),
270
+ 'confidence': max(curr['confidence'], cand['confidence']),
271
+ 'source': 'Google'
272
+ }
273
+ merged_sym['width'] = merged_sym['max_x'] - merged_sym['min_x']
274
+ merged_sym['height'] = merged_sym['max_y'] - merged_sym['min_y']
275
+ merged_sym['center_x'] = (merged_sym['min_x'] + merged_sym['max_x']) / 2
276
+ merged_sym['center_y'] = (merged_sym['min_y'] + merged_sym['max_y']) / 2
277
+ new_symbols.append(merged_sym)
278
+ skip_indices.add(best_cand_idx)
279
+ merged = True
280
+ else:
281
+ new_symbols.append(curr)
282
+ symbols = new_symbols
283
+ if not merged: break
284
+ return symbols
285
+
286
+ # ================================================================================
287
+ # Models Execution
288
+ # ================================================================================
289
+ def get_google_ocr(content: bytes, config: Dict, google_json_path: Optional[str] = None) -> List[Dict]:
290
+ if not HAS_GOOGLE_VISION: return []
291
+ if google_json_path and os.path.exists(google_json_path):
292
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_json_path
293
+
294
+ try:
295
+ client = vision.ImageAnnotatorClient()
296
+ image = vision.Image(content=content)
297
+ context = vision.ImageContext(language_hints=["zh-Hant"])
298
+ response = client.document_text_detection(image=image, image_context=context)
299
+
300
+ if not response.full_text_annotation: return []
301
+
302
+ symbols = []
303
+ for page in response.full_text_annotation.pages:
304
+ for block in page.blocks:
305
+ for paragraph in block.paragraphs:
306
+ for word in paragraph.words:
307
+ for s in word.symbols:
308
+ if not is_hanja(s.text): continue
309
+ v = s.bounding_box.vertices
310
+ x, y = [p.x for p in v], [p.y for p in v]
311
+ symbols.append({
312
+ 'text': s.text,
313
+ 'center_x': (min(x)+max(x))/2, 'center_y': (min(y)+max(y))/2,
314
+ 'min_x': min(x), 'max_x': max(x), 'min_y': min(y), 'max_y': max(y),
315
+ 'width': max(x)-min(x), 'height': max(y)-min(y),
316
+ 'confidence': s.confidence, 'source': 'Google'
317
+ })
318
+
319
+ original_count = len(symbols)
320
+ symbols = merge_google_symbols(symbols, config)
321
+ if len(symbols) < original_count:
322
+ logger.info(f"[OCR] Google ๋ณ‘ํ•ฉ: {original_count} -> {len(symbols)}๊ฐœ")
323
+ return symbols
324
+ except Exception as e:
325
+ logger.error(f"[OCR] Google Vision Error: {e}")
326
+ return []
327
+
328
+ def get_custom_model_ocr(image_path, binary_img, detector, recognizer, config):
329
+ try:
330
+ pil_img = Image.open(image_path).convert("RGB")
331
+ boxes, scores = detector.detect(pil_img)
332
+ if not boxes: return []
333
+
334
+ # Merge
335
+ original_count = len(boxes)
336
+ boxes, scores = merge_vertical_fragments(boxes, scores, config)
337
+ if len(boxes) < original_count:
338
+ logger.info(f"[OCR] Custom ๋ณ‘ํ•ฉ: {original_count} -> {len(boxes)}๊ฐœ")
339
+
340
+ # Stats
341
+ all_heights = [b[3] for b in boxes]
342
+ all_widths = [b[2] for b in boxes]
343
+ median_h = np.median(all_heights) if all_heights else 0
344
+ median_w = np.median(all_widths) if all_widths else 0
345
+
346
+ # Recognize
347
+ crops = [pil_img.crop((int(b[0]), int(b[1]), int(b[0]+b[2]), int(b[1]+b[3]))) for b in boxes]
348
+ chars = recognizer(crops) if crops else []
349
+
350
+ # Filter & Mask (Config values)
351
+ symbols = []
352
+ img_h, _ = binary_img.shape
353
+ ft = config['filtering_thresholds']
354
+ it = config['ink_detection_thresholds']
355
+
356
+ for char, (x, y, w, h), score in zip(chars, boxes, scores):
357
+ if not char or char == "โ– ": continue
358
+
359
+ box_dict = {'min_x': x, 'min_y': y, 'max_x': x+w, 'max_y': y+h}
360
+ density = calculate_pixel_density(binary_img, box_dict)
361
+
362
+ # Hard Filters
363
+ if score < ft['min_score_hard'] or density < ft['density_min_hard']: continue
364
+ # Smart Filters
365
+ if score < ft['smart_score_threshold'] and density < ft['smart_density_threshold']: continue
366
+
367
+ # Title Removal
368
+ is_huge = (h > median_h * 3.5) if median_h > 0 else False
369
+ is_top = (y < img_h * 0.15) and (h > median_h * 2.5 or w > median_w * 2.5) if median_h > 0 else False
370
+ if median_h > 0 and (is_huge or is_top): continue
371
+
372
+ # Masking
373
+ final_text, final_type = char, 'TEXT'
374
+ if density >= it['density_ink_heavy']:
375
+ final_text, final_type = '[MASK1]', 'MASK1'
376
+ elif density >= it['density_ink_partial']:
377
+ final_text, final_type = '[MASK2]', 'MASK2'
378
+ else:
379
+ if not is_hanja(char): continue
380
+
381
+ symbols.append({
382
+ 'text': final_text, 'type': final_type,
383
+ 'center_x': x+w/2, 'center_y': y+h/2,
384
+ 'min_x': x, 'max_x': x+w, 'min_y': y, 'max_y': y+h,
385
+ 'width': w, 'height': h,
386
+ 'confidence': float(score), 'source': 'Custom', 'density': density
387
+ })
388
+
389
+ logger.info(f"[OCR] Custom Model ์™„๋ฃŒ: {len(symbols)}๊ฐœ")
390
+ return symbols
391
+ except Exception as e:
392
+ logger.error(f"[OCR] Custom Model Error: {e}")
393
+ return []
394
+
395
+ # ================================================================================
396
+ # Ensemble Reconstruction (Full Logic from Script)
397
+ # ================================================================================
398
+ def ensemble_reconstruction(google_syms, custom_syms, binary_img, config):
399
+ logger.info("[ENSEMBLE] ์•™์ƒ๋ธ” ์žฌ๊ตฌ์„ฑ ์‹œ์ž‘...")
400
+ img_h, img_w = binary_img.shape
401
+ ec = config['ensemble_config']
402
+ ft = config['filtering_thresholds']
403
+ it = config['ink_detection_thresholds']
404
+
405
+ # --- Helper Functions ---
406
+ def filter_excessive_masks(nodes):
407
+ filtered, buffer = [], []
408
+ threshold = ec['excessive_mask_threshold']
409
+ for node in nodes:
410
+ if 'MASK' in node.get('type', 'TEXT'): buffer.append(node)
411
+ else:
412
+ if buffer:
413
+ if len(buffer) < threshold: filtered.extend(buffer)
414
+ buffer = []
415
+ filtered.append(node)
416
+ if buffer and len(buffer) < threshold: filtered.extend(buffer)
417
+ return filtered
418
+
419
+ def merge_split_masks(nodes, avg_h):
420
+ if not nodes: return []
421
+ merged, skip = [], False
422
+ for i in range(len(nodes)):
423
+ if skip: skip = False; continue
424
+ curr = nodes[i]
425
+ if i == len(nodes)-1: merged.append(curr); break
426
+
427
+ next_node = nodes[i+1]
428
+ if 'MASK' in curr.get('type','TEXT') and 'MASK' in next_node.get('type','TEXT'):
429
+ combined_h = next_node['max_y'] - curr['min_y']
430
+ if combined_h < avg_h * 1.8:
431
+ new_node = curr.copy()
432
+ new_node.update({'max_y': next_node['max_y'], 'height': next_node['max_y'] - curr['min_y']})
433
+ density = calculate_pixel_density(binary_img, new_node)
434
+ new_node['density'] = density
435
+
436
+ if density < ft['density_min_hard']:
437
+ skip = True; continue
438
+
439
+ m_type = 'MASK1' if density >= it['density_ink_heavy'] else 'MASK2'
440
+ new_node.update({'type': m_type, 'text': f'[{m_type}]'})
441
+ merged.append(new_node)
442
+ skip = True
443
+ continue
444
+ merged.append(curr)
445
+ return merged
446
+
447
+ def resolve_overlaps(boxes):
448
+ if not boxes: return []
449
+ boxes.sort(key=lambda x: x['min_y'])
450
+ for i in range(len(boxes)-1):
451
+ curr, next_box = boxes[i], boxes[i+1]
452
+ if min(curr['max_x'], next_box['max_x']) - max(curr['min_x'], next_box['min_x']) <= 0: continue
453
+
454
+ if curr['max_y'] > next_box['min_y']:
455
+ mid_y = (curr['max_y'] + next_box['min_y']) / 2
456
+ curr['max_y'], curr['height'] = mid_y, mid_y - curr['min_y']
457
+ next_box['min_y'], next_box['height'] = mid_y, next_box['max_y'] - mid_y
458
+ return boxes
459
+
460
+ def filter_google_overlaps(g_boxes, c_boxes):
461
+ if not g_boxes: return c_boxes
462
+ filtered = []
463
+ for c in c_boxes:
464
+ is_dup = False
465
+ for g in g_boxes:
466
+ dx = abs(c['center_x'] - g['center_x'])
467
+ dy = abs(c['center_y'] - g['center_y'])
468
+ # MASK is preserved even if overlapping
469
+ if 'MASK' in c.get('type', 'TEXT'): pass
470
+ elif (min(c['max_x'], g['max_x']) > max(c['min_x'], g['min_x']) and
471
+ min(c['max_y'], g['max_y']) > max(c['min_y'], g['min_y'])) or \
472
+ (dx < g['width']*0.4 and dy < g['height']*0.4):
473
+ is_dup = True; break
474
+ if not is_dup: filtered.append(c)
475
+ return filtered
476
+
477
+ def infer_gaps(col, step_y, avg_w):
478
+ if not col: return []
479
+ col.sort(key=lambda s: s['center_y'])
480
+ filled = []
481
+ for i, curr in enumerate(col):
482
+ if i > 0:
483
+ prev = col[i-1]
484
+ gap = curr['center_y'] - prev['center_y']
485
+ if gap > step_y * ec['gap_inference_ratio']:
486
+ missing = int(round(gap/step_y)) - 1
487
+ if missing > 0:
488
+ step = gap / (missing + 1)
489
+ for k in range(1, missing + 1):
490
+ ny = prev['center_y'] + k*step
491
+ nb = {'min_x': curr['center_x'] - avg_w/2, 'max_x': curr['center_x'] + avg_w/2,
492
+ 'min_y': max(0, ny - step_y*0.4), 'max_y': min(img_h, ny + step_y*0.4)}
493
+ nb.update({'height': nb['max_y']-nb['min_y'], 'width': nb['max_x']-nb['min_x'],
494
+ 'center_x': (nb['min_x']+nb['max_x'])/2, 'center_y': (nb['min_y']+nb['max_y'])/2})
495
+
496
+ d = calculate_pixel_density(binary_img, nb)
497
+ if d < ft['density_min_hard']: continue
498
+
499
+ mt = 'MASK1' if d >= it['density_ink_heavy'] else 'MASK2'
500
+ nb.update({'text': f'[{mt}]', 'type': mt, 'density': d, 'confidence': 0.0, 'source': 'Inferred'})
501
+ filled.append(nb)
502
+ filled.append(curr)
503
+ return filled
504
+
505
+ def check_ink_on_google(g_syms):
506
+ filtered = []
507
+ for s in g_syms:
508
+ d = calculate_pixel_density(binary_img, s)
509
+ s['density'] = d
510
+ if d >= it['density_ink_heavy']: s.update({'type': 'MASK1', 'text': '[MASK1]'})
511
+ elif d >= it['density_ink_partial']: s.update({'type': 'MASK2', 'text': '[MASK2]'})
512
+ elif d < ft['density_min_hard']: continue # Hallucination check
513
+ else: s['type'] = 'TEXT'
514
+ filtered.append(s)
515
+ return filtered
516
+
517
+ # --- Preprocessing ---
518
+ all_h = ([s['height'] for s in google_syms] + [s['height'] for s in custom_syms])
519
+ median_h = np.median(all_h) if all_h else 30.0
520
+
521
+ # Filter Height & Check Ink
522
+ def global_remove_tall_and_top(boxes, median_h, threshold=2.0):
523
+ if not boxes: return []
524
+ filtered = []
525
+ for b in boxes:
526
+ if b['height'] > median_h * threshold: continue
527
+ if b['min_y'] < img_h * 0.15 and b['height'] > median_h * 2.5: continue
528
+ filtered.append(b)
529
+ return filtered
530
+
531
+ if google_syms:
532
+ google_syms = global_remove_tall_and_top(google_syms, median_h, threshold=2.0)
533
+ google_syms = check_ink_on_google(google_syms)
534
+ if custom_syms:
535
+ custom_syms = global_remove_tall_and_top(custom_syms, median_h, threshold=3.5)
536
+
537
+ # Resize & Filter Custom
538
+ avg_w = np.mean([s['width'] for s in google_syms]) if google_syms else 0
539
+ median_w = np.median([s['width'] for s in google_syms]) if google_syms else 0
540
+
541
+ processed_custom = []
542
+ for s in custom_syms:
543
+ if 'MASK' in s.get('type', 'TEXT'):
544
+ processed_custom.append(s); continue
545
+
546
+ if (s['width']*s['height'] > (median_w*median_h)*0.2 and
547
+ s['width'] > median_w*0.3 and s['height'] > median_h*0.3):
548
+
549
+ # Resize logic
550
+ if s['width'] < median_w*0.8 or s['height'] < median_h*0.8:
551
+ tw = max(s['width'], median_w*0.9)
552
+ th = max(s['height'], median_h*0.9)
553
+ cx, cy = s['center_x'], s['center_y']
554
+ s.update({'min_x': max(0, cx-tw/2), 'max_x': min(img_w, cx+tw/2),
555
+ 'min_y': max(0, cy-th/2), 'max_y': min(img_h, cy+th/2)})
556
+ s.update({'width': s['max_x']-s['min_x'], 'height': s['max_y']-s['min_y']})
557
+ processed_custom.append(s)
558
+
559
+ custom_syms = filter_google_overlaps(google_syms, processed_custom)
560
+
561
+ if not google_syms and not custom_syms: return [], []
562
+
563
+ # --- Column Grouping ---
564
+ all_syms = google_syms + custom_syms
565
+ columns = []
566
+ if all_syms:
567
+ for s in sorted(all_syms, key=lambda x: -x['center_x']):
568
+ found = False
569
+ for col in columns:
570
+ cx = sum(c['center_x'] for c in col) / len(col)
571
+ if abs(s['center_x'] - cx) < (avg_w if avg_w else s['width']) * ec['column_grouping_ratio']:
572
+ col.append(s); found = True; break
573
+ if not found: columns.append([s])
574
+
575
+ # Vertical Step Calculation
576
+ global_steps = []
577
+ for col in columns:
578
+ col.sort(key=lambda s: s['center_y'])
579
+ for k in range(len(col)-1):
580
+ step = col[k+1]['center_y'] - col[k]['center_y']
581
+ if median_h * 0.8 < step < median_h * 1.5: global_steps.append(step)
582
+ global_step = np.median(global_steps) if global_steps else median_h * 1.1
583
+
584
+ # --- Reconstruction ---
585
+ final_boxes, lines = [], []
586
+ for col in columns:
587
+ col.sort(key=lambda s: s['center_y'])
588
+ local_steps = [col[k+1]['center_y'] - col[k]['center_y'] for k in range(len(col)-1)
589
+ if median_h*0.8 < (col[k+1]['center_y'] - col[k]['center_y']) < median_h*1.5]
590
+ step_y = np.median(local_steps) if local_steps else global_step
591
+
592
+ # Deduplication in column
593
+ unique_col = []
594
+ if col:
595
+ prev = col[0]
596
+ unique_col.append(prev)
597
+ for k in range(1, len(col)):
598
+ curr = col[k]
599
+ dist_y = abs(curr['center_y'] - prev['center_y'])
600
+ is_same_text = (curr.get('text') == prev.get('text'))
601
+ is_close = (dist_y < median_h * 0.6)
602
+
603
+ if is_close:
604
+ prev_is_mask = 'MASK' in prev.get('type', 'TEXT')
605
+ curr_is_mask = 'MASK' in curr.get('type', 'TEXT')
606
+
607
+ if prev_is_mask and curr_is_mask:
608
+ if prev['density'] < curr['density']:
609
+ unique_col.pop()
610
+ unique_col.append(curr)
611
+ prev = curr
612
+ continue
613
+ elif prev_is_mask and not curr_is_mask:
614
+ continue
615
+ elif not prev_is_mask and curr_is_mask:
616
+ unique_col.pop()
617
+ unique_col.append(curr)
618
+ prev = curr
619
+ continue
620
+
621
+ if is_same_text and is_close:
622
+ if prev.get('source') == 'Google':
623
+ continue
624
+ elif curr.get('source') == 'Google':
625
+ unique_col.pop()
626
+ unique_col.append(curr)
627
+ prev = curr
628
+ else:
629
+ continue
630
+ else:
631
+ unique_col.append(curr)
632
+ prev = curr
633
+
634
+ col = infer_gaps(unique_col, step_y, avg_w if avg_w else median_h)
635
+
636
+ # Gap Filling with Masks
637
+ filled_col, cy = [], col[0]['min_y'] if col else 0
638
+ for item in col:
639
+ gap = item['min_y'] - cy
640
+ if gap > step_y * 1.2:
641
+ mb = {'min_x': item['center_x'] - (avg_w if avg_w else median_h)/2,
642
+ 'max_x': item['center_x'] + (avg_w if avg_w else median_h)/2,
643
+ 'min_y': max(0, cy + gap*0.1), 'max_y': min(img_h, item['min_y'] - gap*0.1)}
644
+ d = calculate_pixel_density(binary_img, mb)
645
+ if d >= ft['density_min_hard']:
646
+ mt = 'MASK1' if d >= it['density_ink_heavy'] else 'MASK2'
647
+ if d >= it['density_ink_partial']:
648
+ filled_col.append({'text': f'[{mt}]', 'type': mt, 'density': d,
649
+ 'min_x': mb['min_x'], 'max_x': mb['max_x'],
650
+ 'min_y': mb['min_y'], 'max_y': mb['max_y'],
651
+ 'confidence': 0.0, 'source': 'GapFill'})
652
+
653
+ if item.get('density', 0) < ft['density_min_hard'] and 'MASK' not in item.get('type','TEXT'):
654
+ cy = item['max_y']; continue
655
+
656
+ filled_col.append(item)
657
+ cy = item['max_y']
658
+
659
+ filled_col = merge_split_masks(filled_col, median_h)
660
+ filled_col = filter_excessive_masks(filled_col)
661
+ filled_col = resolve_overlaps(filled_col)
662
+
663
+ final_boxes.extend(filled_col)
664
+ lines.append("".join([s['text'] for s in filled_col]))
665
+
666
+ logger.info(f"[ENSEMBLE] ์™„๋ฃŒ: {len(final_boxes)}๊ฐœ ๋ฐ•์Šค, {len(lines)}๊ฐœ ์—ด")
667
+ return final_boxes, lines
668
+
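+ # Pipeline summary (added): ensemble_reconstruction takes the merged Google and custom-model
+ # symbols, drops oversized/low-ink boxes, groups the rest into vertical columns by X center
+ # (ordered right to left), infers missing characters from the per-column step size, and fills
+ # damaged spots with [MASK1]/[MASK2] boxes. It returns (final_boxes, lines), where each entry
+ # of `lines` is one column read top to bottom, e.g. "ๅคฉๅœฐ[MASK2]้ปƒ" (illustrative).
+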
669
+ # ================================================================================
670
+ # OCREngine Class
671
+ # ================================================================================
672
+ class OCREngine:
673
+ def __init__(self, config_path: Optional[str] = None):
674
+ self.config = load_ocr_config(config_path)
675
+
676
+ # Load paths from env
677
+ base_path = os.getenv('OCR_WEIGHTS_BASE_PATH')
678
+ if not base_path:
679
+ raise ValueError("OCR_WEIGHTS_BASE_PATH environment variable is required. Please set it in your .env file.")
680
+
681
+ self.det_ckpt = os.path.join(base_path, os.getenv('OCR_DETECTION_MODEL', 'best.pth'))
682
+ self.rec_ckpt = os.path.join(base_path, os.getenv('OCR_RECOGNITION_MODEL', 'best_5000.pt'))
683
+ self.google_json = os.path.join(base_path, os.getenv('GOOGLE_CREDENTIALS_JSON'))
684
+
685
+ if not self.google_json or not os.path.exists(self.google_json):
686
+ raise ValueError(f"GOOGLE_CREDENTIALS_JSON environment variable is required and file must exist. Please set it in your .env file.")
687
+
688
+ if os.path.exists(self.google_json):
689
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.google_json
690
+
691
+ # Device
692
+ dev_cfg = self.config['model_config']['device']
693
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if dev_cfg == 'auto' else torch.device(dev_cfg)
694
+ self.detector = None
695
+ self.recognizer = None
696
+
697
+ def _load_models(self):
698
+ if not self.detector:
699
+ self.detector = TextDetector(self.device, self.det_ckpt, self.config)
700
+ if not self.recognizer:
701
+ self.recognizer = ResnetCustom(weight_fn=self.rec_ckpt)
702
+ self.recognizer.to(self.device)
703
+
704
+ def run_ocr(self, image_path: str) -> Dict:
705
+ try:
706
+ self._load_models()
707
+
708
+ # 1. Preprocessing (Exact Match to v12 Script)
709
+ img_bgr = cv2.imread(image_path)
710
+ if img_bgr is None: raise ValueError(f"Image not found: {image_path}")
711
+
712
+ img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
713
+ img_blur = cv2.medianBlur(img_gray, 3)
714
+ _, img_binary = cv2.threshold(img_blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
715
+ kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
716
+ img_binary = cv2.morphologyEx(img_binary, cv2.MORPH_CLOSE, kernel)
717
+
718
+ # 2. Google Vision
719
+ with io.open(image_path, 'rb') as f: content = f.read()
720
+ google_syms = get_google_ocr(content, self.config, self.google_json)
721
+
722
+ # 3. Custom Model
723
+ custom_syms = get_custom_model_ocr(image_path, img_binary, self.detector, self.recognizer, self.config)
724
+
725
+ # 4. Ensemble
726
+ final_boxes, result_lines = ensemble_reconstruction(google_syms, custom_syms, img_binary, self.config)
727
+
728
+ # Format results according to specification
729
+ formatted_results = []
730
+ for order, box in enumerate(final_boxes):
731
+ formatted_results.append({
732
+ "order": order,
733
+ "text": box.get('text', ''),
734
+ "type": box.get('type', 'TEXT'),
735
+ "box": [
736
+ float(box.get('min_x', 0)),
737
+ float(box.get('min_y', 0)),
738
+ float(box.get('max_x', 0)),
739
+ float(box.get('max_y', 0))
740
+ ],
741
+ "confidence": float(box.get('confidence', 0.0)),
742
+ "source": box.get('source', 'Unknown')
743
+ })
744
+
745
+ # Extract image filename
746
+ image_filename = os.path.basename(image_path)
747
+
748
+ return {
749
+ "success": True,  # keep the flag consistent with the failure branch below
+ "image": image_filename,
750
+ "results": formatted_results
751
+ }
752
+ except Exception as e:
753
+ logger.error(f"[OCR] Execution Failed: {e}", exc_info=True)
754
+ return {"success": False, "error": str(e)}
755
+
756
+ # ================================================================================
757
+ # Global Accessor
758
+ # ================================================================================
759
+ _engine = None
760
+
761
+ def get_ocr_engine(config_path: Optional[str] = None) -> OCREngine:
762
+ global _engine
763
+ if _engine is None: _engine = OCREngine(config_path)
764
+ return _engine
765
+
766
+ def ocr_and_detect(image_path: str, config_path: Optional[str] = None, bbox: Optional[Tuple[int, int, int, int]] = None, device: str = "cuda") -> Dict:
767
+ return get_ocr_engine(config_path).run_ocr(image_path)  # bbox/device are accepted for compatibility but currently unused
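+
+ # Minimal usage sketch (illustrative): requires OCR_WEIGHTS_BASE_PATH and
+ # GOOGLE_CREDENTIALS_JSON to be set, e.g. via a .env file:
+ #   result = ocr_and_detect("rubbing.jpg")
+ #   for r in result.get("results", []):
+ #       print(r["order"], r["text"], r["box"], r["source"])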
ai_modules/preprocessor_unified.py ADDED
@@ -0,0 +1,605 @@
1
+ # Epitext_Back/ai_modules/preprocessor_unified.py
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ ================================================================================
5
+ Unified Image Preprocessing Module for Epitext AI Project
6
+ ================================================================================
7
+
8
+ ๋ชจ๋“ˆ๋ช…: preprocessor_unified.py (v1.0.0 - Production Ready)
9
+ ์ž‘์„ฑ์ผ: 2025-12-02
10
+ ๋ชฉ์ : ํ•œ์ž ์ด๋ฏธ์ง€๋ฅผ Swin Gray์™€ OCR์šฉ์œผ๋กœ ๋™์‹œ์— ์ „์ฒ˜๋ฆฌ
11
+ ์ƒํƒœ: Production Ready
12
+
13
+ ํ•ต์‹ฌ ๊ธฐ๋Šฅ:
14
+ ํ•œ ๋ฒˆ์— ๋‘ ๊ฐ€์ง€ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ:
15
+ 1. Swin Gray: ๊ทธ๋ ˆ์ด ๋น„์ด์ง„ํ™” -> 3์ฑ„๋„ (์ •๋ณด ์†์‹ค ์ตœ์†Œ)
16
+ 2. OCR: ์ด์ง„ํ™” -> 1์ฑ„๋„ (๋ช…ํ™•ํ•œ ํ‘๋ฐฑ)
17
+
18
+ ์ž๋™ ๋ฐฐ๊ฒฝ ๋ณด์žฅ:
19
+ - Swin: ๋ฐ์€๋ฐฐ๊ฒฝ (>=127)
20
+ - OCR: ํฐ๋ฐฐ๊ฒฝ + ๊ฒ€์ •๊ธ€์ž (255/0)
21
+
22
+ ํƒ๋ณธ ์ž๋™ ๊ฒ€์ถœ: ํฐ ์–ด๋‘์šด ์˜์—ญ ์‹๋ณ„
23
+ ์˜์—ญ ๊ฒ€์ถœ 1ํšŒ: ํšจ์œจ์„ฑ
24
+ ์„ค์ • ํŒŒ์ผ ์ง€์›: JSON ๊ธฐ๋ฐ˜ ์ปค์Šคํ„ฐ๋งˆ์ด์ง•
25
+ ๋กœ๊น… ์ง€์›: DEBUG, INFO, WARNING, ERROR
26
+
27
+ ์˜์กด์„ฑ:
28
+ - opencv-python >= 4.8.0
29
+ - numpy >= 1.24.0
30
+
31
+ ๋‹จ์ผ ํ•จ์ˆ˜:
32
+ preprocess_image_unified(input_path, output_swin_path, output_ocr_path, ...)
33
+
34
+ ์‚ฌ์šฉ ์˜ˆ์‹œ:
35
+ >>> from ai_modules.preprocessor_unified import preprocess_image_unified
36
+ >>> result = preprocess_image_unified(
37
+ ... "input.jpg",
38
+ ... "swin.jpg",
39
+ ... "ocr.png"
40
+ ... )
41
+
42
+ ================================================================================
43
+ """
44
+
45
+
46
+ import cv2
47
+ import numpy as np
48
+ from pathlib import Path
49
+ import json
50
+ import logging
51
+ from typing import Dict, Optional, Tuple
52
+
53
+
54
+ # ================================================================================
55
+ # Logging Configuration
56
+ # ================================================================================
57
+
58
+
59
+ logging.basicConfig(
60
+ level=logging.INFO,
61
+ format='%(asctime)s - [%(levelname)s] %(message)s'
62
+ )
63
+ logger = logging.getLogger(__name__)
64
+
65
+
66
+ # ================================================================================
67
+ # Constants
68
+ # ================================================================================
69
+
70
+
71
+ # Default settings
72
+ DEFAULT_MARGIN = 10
73
+ DEFAULT_BRIGHTNESS_THRESHOLD = 127
74
+ DEFAULT_RUBBING_MIN_AREA_RATIO = 0.1
75
+ DEFAULT_TEXT_MIN_AREA = 16
76
+ DEFAULT_TEXT_AREA_RATIO = 0.00005
77
+ DEFAULT_MORPHOLOGY_KERNEL_SIZE = (2, 2)
78
+ DEFAULT_MORPHOLOGY_CLOSE_ITERATIONS = 3
79
+ DEFAULT_MORPHOLOGY_OPEN_ITERATIONS = 2
80
+ DEFAULT_RUBBING_KERNEL_SIZE = (5, 5)
81
+ DEFAULT_RUBBING_CLOSE_ITERATIONS = 10
82
+ DEFAULT_RUBBING_OPEN_ITERATIONS = 5
83
+
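+ # Illustrative preprocess_config.json sketch: keys mirror the defaults above, the values shown
+ # are the in-code defaults (JSON arrays load fine where tuples are expected), and any key
+ # starting with '_' (e.g. "_description") is ignored on load:
+ # {
+ #   "margin": 10,
+ #   "brightness_threshold": 127,
+ #   "rubbing_min_area_ratio": 0.1,
+ #   "text_min_area": 16,
+ #   "text_area_ratio": 0.00005,
+ #   "morphology_kernel_size": [2, 2],
+ #   "morphology_close_iterations": 3,
+ #   "morphology_open_iterations": 2,
+ #   "rubbing_kernel_size": [5, 5],
+ #   "rubbing_close_iterations": 10,
+ #   "rubbing_open_iterations": 5
+ # }
+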
84
+
85
+ # ================================================================================
86
+ # Main Preprocessing Class
87
+ # ================================================================================
88
+
89
+
90
+ class UnifiedImagePreprocessor:
91
+ """
92
+ Unified image preprocessing class (Swin + OCR).
93
+
94
+ Produces both the Swin Gray and the OCR image in a single pass.
95
+
96
+ Attributes:
97
+ config (dict): preprocessing configuration parameters
98
+
99
+ Example:
100
+ >>> prep = UnifiedImagePreprocessor()
101
+ >>> result = prep.preprocess_unified("input.jpg", "swin.jpg", "ocr.png")
102
+ """
103
+
104
+ def __init__(self, config_path: Optional[str] = None) -> None:
105
+ """
106
+ Initialize UnifiedImagePreprocessor.
107
+
108
+ Args:
109
+ config_path (str, optional): path to a JSON config file
110
+ """
111
+ self.config = self._load_config(config_path)
112
+ logger.info("[INIT] UnifiedImagePreprocessor v1.0.0 ์ดˆ๊ธฐํ™” ์™„๋ฃŒ")
113
+
114
+ def _load_config(self, config_path: Optional[str]) -> Dict:
115
+ """์„ค์ • ํŒŒ์ผ ๋กœ๋“œ"""
116
+ default_config = {
117
+ "margin": DEFAULT_MARGIN,
118
+ "brightness_threshold": DEFAULT_BRIGHTNESS_THRESHOLD,
119
+ "rubbing_min_area_ratio": DEFAULT_RUBBING_MIN_AREA_RATIO,
120
+ "text_min_area": DEFAULT_TEXT_MIN_AREA,
121
+ "text_area_ratio": DEFAULT_TEXT_AREA_RATIO,
122
+ "morphology_kernel_size": DEFAULT_MORPHOLOGY_KERNEL_SIZE,
123
+ "morphology_close_iterations": DEFAULT_MORPHOLOGY_CLOSE_ITERATIONS,
124
+ "morphology_open_iterations": DEFAULT_MORPHOLOGY_OPEN_ITERATIONS,
125
+ "rubbing_kernel_size": DEFAULT_RUBBING_KERNEL_SIZE,
126
+ "rubbing_close_iterations": DEFAULT_RUBBING_CLOSE_ITERATIONS,
127
+ "rubbing_open_iterations": DEFAULT_RUBBING_OPEN_ITERATIONS,
128
+ }
129
+
130
+ # Default config file path (used when config_path is not given)
131
+ if config_path is None:
132
+ default_config_path = Path(__file__).parent / "config" / "preprocess_config.json"
133
+ if default_config_path.exists():
134
+ config_path = str(default_config_path)
135
+
136
+ if config_path and Path(config_path).exists():
137
+ try:
138
+ with open(config_path, 'r', encoding='utf-8') as f:
139
+ user_config = json.load(f)
140
+ # Update with the user config, excluding '_description'-style metadata fields
141
+ user_config_clean = {k: v for k, v in user_config.items() if not k.startswith('_')}
142
+ default_config.update(user_config_clean)
143
+ logger.info(f"[CONFIG] ์„ค์ • ํŒŒ์ผ ๋กœ๋“œ: {config_path}")
144
+ except Exception as e:
145
+ logger.warning(f"[CONFIG] ์„ค์ • ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e} - ๊ธฐ๋ณธ ์„ค์ • ์‚ฌ์šฉ")
146
+
147
+ return default_config
148
+
149
+ def _find_rubbing_bbox(self, gray_image: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
150
+ """
151
+ ํƒ๋ณธ ์˜์—ญ ๊ฒ€์ถœ (ํฐ ์–ด๋‘์šด ์‚ฌ๊ฐํ˜• ์ฐพ๊ธฐ)
152
+
153
+ Args:
154
+ gray_image (np.ndarray): ๊ทธ๋ ˆ์ด์Šค์ผ€์ผ ์ด๋ฏธ์ง€
155
+
156
+ Returns:
157
+ tuple: (x, y, w, h) ๋˜๋Š” None
158
+ """
159
+ H_img, W_img = gray_image.shape
160
+
161
+ # Step 1: extract dark regions
162
+ _, dark_mask = cv2.threshold(gray_image, 127, 255, cv2.THRESH_BINARY_INV)
163
+
164
+ # Step 2: ๋ชจํด๋กœ์ง€ ์—ฐ์‚ฐ
165
+ kernel_rub = np.ones(self.config["rubbing_kernel_size"], np.uint8)
166
+ dark_mask = cv2.morphologyEx(
167
+ dark_mask, cv2.MORPH_CLOSE, kernel_rub,
168
+ iterations=self.config["rubbing_close_iterations"]
169
+ )
170
+ dark_mask = cv2.morphologyEx(
171
+ dark_mask, cv2.MORPH_OPEN, kernel_rub,
172
+ iterations=self.config["rubbing_open_iterations"]
173
+ )
174
+
175
+ # Step 3: contour detection
176
+ contours, _ = cv2.findContours(dark_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
177
+
178
+ if not contours:
179
+ return None
180
+
181
+ # Step 4: take the largest contour
182
+ largest = max(contours, key=cv2.contourArea)
183
+ area = cv2.contourArea(largest)
184
+
185
+ # Step 5: area validation
186
+ min_area = (H_img * W_img) * self.config["rubbing_min_area_ratio"]
187
+ if area < min_area:
188
+ return None
189
+
190
+ return cv2.boundingRect(largest)
191
+
192
+ def _find_text_bbox(self, gray_image: np.ndarray) -> Tuple[int, int, int, int]:
193
+ """
194
+ ํ…์ŠคํŠธ ์˜์—ญ ๊ฒ€์ถœ
195
+
196
+ Args:
197
+ gray_image (np.ndarray): ๊ทธ๋ ˆ์ด์Šค์ผ€์ผ ์ด๋ฏธ์ง€
198
+
199
+ Returns:
200
+ tuple: (x, y, w, h)
201
+ """
202
+ H_img, W_img = gray_image.shape
203
+
204
+ # Step 1: Otsu binarization
205
+ _, binary = cv2.threshold(
206
+ gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
207
+ )
208
+
209
+ # Step 2: ๋ชจํด๋กœ์ง€ ์—ฐ์‚ฐ
210
+ kernel_morph = np.ones(self.config["morphology_kernel_size"], np.uint8)
211
+ binary = cv2.morphologyEx(
212
+ binary, cv2.MORPH_CLOSE, kernel_morph,
213
+ iterations=self.config["morphology_close_iterations"]
214
+ )
215
+ binary = cv2.morphologyEx(
216
+ binary, cv2.MORPH_OPEN, kernel_morph,
217
+ iterations=self.config["morphology_open_iterations"]
218
+ )
219
+
220
+ # Step 3: contour detection
221
+ contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
222
+
223
+ # Step 4: minimum area
224
+ min_area = max(
225
+ self.config["text_min_area"],
226
+ int((H_img * W_img) * self.config["text_area_ratio"])
227
+ )
228
+
229
+ # Step 5: filter valid contours
230
+ # Compare on the bounding-rect area; cv2.contourArea expects a point array,
+ # so calling it on a boundingRect tuple would raise an error.
+ valid_contours = [
231
+ cnt for cnt in contours
232
+ if cv2.boundingRect(cnt)[2] * cv2.boundingRect(cnt)[3] >= min_area
233
+ ]
234
+
235
+ # Step 6: compute the bounding box
236
+ if valid_contours:
237
+ all_points = np.vstack(valid_contours)
238
+ return cv2.boundingRect(all_points)
239
+ else:
240
+ return (0, 0, W_img, H_img)
241
+
242
+ def _apply_margin(
243
+ self,
244
+ bbox: Tuple[int, int, int, int],
245
+ gray_image: np.ndarray,
246
+ margin_val: int
247
+ ) -> Tuple[int, int, int, int]:
248
+ """์—ฌ๋ฐฑ ์ถ”๊ฐ€"""
249
+ x, y, w, h = bbox
250
+ H_img, W_img = gray_image.shape
251
+
252
+ x_new = max(0, x - margin_val)
253
+ y_new = max(0, y - margin_val)
254
+ w_new = min(W_img - x_new, w + 2 * margin_val)
255
+ h_new = min(H_img - y_new, h + 2 * margin_val)
256
+
257
+ return (x_new, y_new, w_new, h_new)
258
+
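+ # Worked example (illustrative): for an 800x600 image, margin 10 and bbox (5, 5, 100, 100)
+ # give (0, 0, 120, 120): x/y are clamped at the border while the full 2*margin is still added
+ # to w/h, so the box extends slightly farther toward the opposite side.
+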
259
+ def _ensure_bright_background(
260
+ self,
261
+ gray_cropped: np.ndarray
262
+ ) -> Tuple[np.ndarray, Dict]:
263
+ """
264
+ ๋ฐ์€๋ฐฐ๊ฒฝ ๋ณด์žฅ (Swin์šฉ)
265
+
266
+ Returns:
267
+ tuple: (์ฒ˜๋ฆฌ๋œ ๊ทธ๋ ˆ์ด ์ด๋ฏธ์ง€, ์ฒ˜๋ฆฌ ์ •๋ณด)
268
+ """
269
+ mean_brightness = np.mean(gray_cropped)
270
+ is_inverted = False
271
+
272
+ if mean_brightness < self.config["brightness_threshold"]:
273
+ gray_bright = cv2.bitwise_not(gray_cropped)
274
+ is_inverted = True
275
+ else:
276
+ gray_bright = gray_cropped.copy()
277
+
278
+ # ์žฌํ™•์ธ
279
+ final_brightness = np.mean(gray_bright)
280
+ if final_brightness < self.config["brightness_threshold"]:
281
+ gray_bright = cv2.bitwise_not(gray_bright)
282
+ is_inverted = not is_inverted
283
+ final_brightness = np.mean(gray_bright)
284
+
285
+ return gray_bright, {
286
+ "mean_brightness_before": float(mean_brightness),
287
+ "mean_brightness_after": float(final_brightness),
288
+ "is_inverted": is_inverted,
289
+ "is_bright_bg": final_brightness >= self.config["brightness_threshold"]
290
+ }
291
+
292
+ def _ensure_white_background(
293
+ self,
294
+ gray_cropped: np.ndarray
295
+ ) -> Tuple[np.ndarray, Dict]:
296
+ """
297
+ Guarantee a white background (for OCR).
298
+
299
+ Returns:
300
+ tuple: (์ฒ˜๋ฆฌ๋œ ์ด์ง„ ์ด๋ฏธ์ง€, ์ฒ˜๋ฆฌ ์ •๋ณด)
301
+ """
302
+ # Step 1: ์ด์ง„ํ™”
303
+ _, binary = cv2.threshold(
304
+ gray_cropped, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
305
+ )
306
+
307
+ # Step 2: ํด๋ผ๋ฆฌํ‹ฐ ํŒ๋‹จ
308
+ mean_brightness = np.mean(binary)
309
+
310
+ # Step 3: invert if needed
311
+ if mean_brightness < self.config["brightness_threshold"]:
312
+ binary_final = cv2.bitwise_not(binary)
313
+ polarity = "inverted"
314
+ else:
315
+ binary_final = binary
316
+ polarity = "normal"
317
+
318
+ final_brightness = np.mean(binary_final)
319
+
320
+ return binary_final, {
321
+ "mean_brightness_before": float(mean_brightness),
322
+ "mean_brightness_after": float(final_brightness),
323
+ "polarity": polarity,
324
+ "is_white_bg": final_brightness > self.config["brightness_threshold"]
325
+ }
326
+
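+ # Worked example (illustrative): if Otsu leaves ~70% of the pixels black, the binary mean is
+ # about 0.3 * 255 โ‰ˆ 76 < 127, so the image is inverted and polarity is reported as "inverted";
+ # the majority class (assumed to be the background) always ends up white.
+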
327
+ def preprocess_unified(
328
+ self,
329
+ input_image_path: str,
330
+ output_swin_path: str,
331
+ output_ocr_path: str,
332
+ margin: Optional[int] = None,
333
+ use_rubbing: bool = False
334
+ ) -> Dict:
335
+ """
336
+ Unified preprocessing (generates the Swin Gray and OCR images together).
337
+
338
+ One call produces both the Swin Gray and the OCR image.
339
+ Rubbing/text region detection runs only once, which keeps the pass efficient.
340
+
341
+ Args:
342
+ input_image_path (str): input image path
343
+ output_swin_path (str): Swin Gray output path (JPG)
344
+ output_ocr_path (str): OCR output path (PNG)
345
+ margin (int, optional): crop margin in pixels
346
+ use_rubbing (bool): enable rubbing detection (default: False)
347
+
348
+ Returns:
349
+ dict: processing result
350
+ On success: {
351
+ "success": True,
352
+ "original_shape": (H, W, C),
353
+ "bbox": (x, y, w, h),
354
+ "region_type": "text" or "rubbing",
355
+ "region_detected": bool,
356
+
357
+ "swin": {
358
+ "output_path": str,
359
+ "output_shape": (H, W, 3),
360
+ "is_bright_bg": bool,
361
+ ...
362
+ },
363
+
364
+ "ocr": {
365
+ "output_path": str,
366
+ "output_shape": (H, W),
367
+ "is_white_bg": bool,
368
+ ...
369
+ }
370
+ }
371
+
372
+ On failure: {
373
+ "success": False,
374
+ "message": str
375
+ }
376
+
377
+ Processing Steps:
378
+ 1. Load the image
379
+ 2. Convert to grayscale
380
+ 3. Detect the region (rubbing or text, once only)
381
+ 4. Crop + margin
382
+ 5. Swin Gray processing (bright background guaranteed)
383
+ 6. OCR processing (binarization + white background guaranteed)
384
+ 7. Save both outputs
385
+
386
+ Output:
387
+ - Swin: JPG, 3 channels (non-binarized, 256 gray levels)
388
+ - OCR: PNG, 1 channel (binarized)
389
+
390
+ Example:
391
+ >>> prep = UnifiedImagePreprocessor()
392
+ >>> result = prep.preprocess_unified(
393
+ ... "input.jpg",
394
+ ... "swin.jpg",
395
+ ... "ocr.png"
396
+ ... )
397
+ >>> if result["success"]:
398
+ ... swin_output = result["swin"]["output_path"]
399
+ ... ocr_output = result["ocr"]["output_path"]
400
+ """
401
+ margin_val = margin or self.config["margin"]
402
+
403
+ try:
404
+ # ====================================================================
405
+ # Step 1: ์ด๋ฏธ์ง€ ๋กœ๋“œ
406
+ # ====================================================================
407
+ img_bgr = cv2.imread(str(input_image_path), cv2.IMREAD_COLOR)
408
+ if img_bgr is None:
409
+ raise ValueError(f"์ด๋ฏธ์ง€ ๋กœ๋“œ ์‹คํŒจ: {input_image_path}")
410
+
411
+ original_shape = img_bgr.shape
412
+ logger.info(f"[LOAD] ์ด๋ฏธ์ง€ ๋กœ๋“œ: {input_image_path} {original_shape}")
413
+
414
+ # ====================================================================
415
+ # Step 2: ๊ทธ๋ ˆ์ด์Šค์ผ€์ผ ๋ณ€ํ™˜
416
+ # ====================================================================
417
+ gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
418
+
419
+ # ====================================================================
420
+ # Step 3: ์˜์—ญ ๊ฒ€์ถœ (ํƒ๋ณธ ๋˜๋Š” ํ…์ŠคํŠธ)
421
+ # ====================================================================
422
+ if use_rubbing:
423
+ detected_bbox = self._find_rubbing_bbox(gray)
424
+ region_type = "rubbing"
425
+ logger.info("[DETECT] ํƒ๋ณธ ์˜์—ญ ๊ฒ€์ถœ ๋ชจ๋“œ")
426
+ else:
427
+ detected_bbox = None
428
+ region_type = "text"
429
+ logger.info("[DETECT] ํ…์ŠคํŠธ ์˜์—ญ ๊ฒ€์ถœ ๋ชจ๋“œ")
430
+
431
+ H_img, W_img = gray.shape
432
+
433
+ # ====================================================================
434
+ # Step 4: ํฌ๋กญ + ์—ฌ๋ฐฑ
435
+ # ====================================================================
436
+ if detected_bbox is not None:
437
+ bbox_final = self._apply_margin(detected_bbox, gray, margin_val)
438
+ logger.info(f"[DETECT] {region_type} ์˜์—ญ ๊ฒ€์ถœ: {bbox_final}")
439
+ else:
440
+ # ํƒ๋ณธ ๋ฏธ๊ฒ€์ถœ ๋˜๋Š” ํ…์ŠคํŠธ ๋ชจ๋“œ -> ํ…์ŠคํŠธ ๊ฒ€์ถœ
441
+ if use_rubbing:
442
+ bbox_final = (0, 0, W_img, H_img)
443
+ logger.warning("[DETECT] ํƒ๋ณธ ๋ฏธ๊ฒ€์ถœ - ์ „์ฒด ์ด๋ฏธ์ง€ ์‚ฌ์šฉ")
444
+ else:
445
+ bbox_text = self._find_text_bbox(gray)
446
+ bbox_final = self._apply_margin(bbox_text, gray, margin_val)
447
+ logger.info(f"[DETECT] ํ…์ŠคํŠธ ์˜์—ญ ๊ฒ€์ถœ: {bbox_final}")
448
+
449
+ x, y, w, h = bbox_final
450
+ gray_cropped = gray[y:y+h, x:x+w]
451
+
452
+ logger.info(f"[CROP] ํฌ๋กญ ์™„๋ฃŒ: {gray_cropped.shape}")
453
+
454
+ # ====================================================================
455
+ # Step 5: Swin Gray processing
456
+ # ====================================================================
457
+ gray_bright, info_swin = self._ensure_bright_background(gray_cropped)
458
+ swin_output_3ch = cv2.cvtColor(gray_bright, cv2.COLOR_GRAY2BGR)
459
+
460
+ # ====================================================================
461
+ # Step 6: OCR processing
462
+ # ====================================================================
463
+ binary_final, info_ocr = self._ensure_white_background(gray_cropped)
464
+
465
+ # ====================================================================
466
+ # Step 7: ๋™์‹œ ์ €์žฅ
467
+ # ====================================================================
468
+ output_swin_path_obj = Path(output_swin_path)
469
+ output_swin_path_obj.parent.mkdir(parents=True, exist_ok=True)
470
+ swin_success = cv2.imwrite(str(output_swin_path_obj), swin_output_3ch)
471
+
472
+ output_ocr_path_obj = Path(output_ocr_path)
473
+ output_ocr_path_obj.parent.mkdir(parents=True, exist_ok=True)
474
+ ocr_success = cv2.imwrite(str(output_ocr_path_obj), binary_final)
475
+
476
+ if not swin_success or not ocr_success:
477
+ raise ValueError("์ด๋ฏธ์ง€ ์ €์žฅ ์‹คํŒจ")
478
+
479
+ logger.info(f"[SAVE] Swin ์ €์žฅ: {output_swin_path_obj}")
480
+ logger.info(f"[SAVE] OCR ์ €์žฅ: {output_ocr_path_obj}")
481
+
482
+ # ====================================================================
483
+ # Return the result
484
+ # ====================================================================
485
+ return {
486
+ "success": True,
487
+ "version": "Unified Swin Gray + OCR (v1.0.0)",
488
+ "original_shape": original_shape,
489
+ "bbox": bbox_final,
490
+ "region_type": region_type,
491
+ "region_detected": detected_bbox is not None,
492
+
493
+ # Swin part
494
+ "swin": {
495
+ "output_path": str(output_swin_path_obj).replace("\\", "/"),
496
+ "output_shape": swin_output_3ch.shape,
497
+ "color_type": "Grayscale 3์ฑ„๋„ (B=G=R, ๋น„์ด์ง„ํ™” 256๋‹จ๊ณ„)",
498
+ "is_inverted": info_swin["is_inverted"],
499
+ "mean_brightness_before": info_swin["mean_brightness_before"],
500
+ "mean_brightness_after": info_swin["mean_brightness_after"],
501
+ "is_bright_bg": info_swin["is_bright_bg"]
502
+ },
503
+
504
+ # OCR part
505
+ "ocr": {
506
+ "output_path": str(output_ocr_path_obj).replace("\\", "/"),
507
+ "output_shape": binary_final.shape,
508
+ "polarity": info_ocr["polarity"],
509
+ "mean_brightness_before": info_ocr["mean_brightness_before"],
510
+ "mean_brightness_after": info_ocr["mean_brightness_after"],
511
+ "is_white_bg": info_ocr["is_white_bg"]
512
+ },
513
+
514
+ "message": "[DONE] ํ†ตํ•ฉ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ (Swin + OCR)"
515
+ }
516
+
517
+ except Exception as e:
518
+ logger.error(f"[ERROR] ํ†ตํ•ฉ ์ „์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
519
+ return {
520
+ "success": False,
521
+ "message": str(e)
522
+ }
523
+
524
+
525
+ # ================================================================================
526
+ # Global Instance & Convenience Functions
527
+ # ================================================================================
528
+
529
+
530
+ _global_preprocessor = None
531
+
532
+
533
+ def get_preprocessor(config_path: Optional[str] = None) -> UnifiedImagePreprocessor:
534
+ """์ „์—ญ ์ „์ฒ˜๋ฆฌ๊ธฐ ์ธ์Šคํ„ด์Šค ๋ฐ˜ํ™˜"""
535
+ global _global_preprocessor
536
+ if _global_preprocessor is None:
537
+ _global_preprocessor = UnifiedImagePreprocessor(config_path)
538
+ return _global_preprocessor
539
+
540
+
541
+ def preprocess_image_unified(
542
+ input_path: str,
543
+ output_swin_path: str,
544
+ output_ocr_path: str,
545
+ margin: Optional[int] = None,
546
+ use_rubbing: bool = False
547
+ ) -> Dict:
548
+ """
549
+ ํŽธ์˜ ํ•จ์ˆ˜: ํ†ตํ•ฉ ์ „์ฒ˜๋ฆฌ
550
+
551
+ Args:
552
+ input_path (str): ์ž…๋ ฅ ์ด๋ฏธ์ง€ ๊ฒฝ๋กœ
553
+ output_swin_path (str): Swin ์ถœ๋ ฅ ๊ฒฝ๋กœ
554
+ output_ocr_path (str): OCR ์ถœ๋ ฅ ๊ฒฝ๋กœ
555
+ margin (int, optional): ์—ฌ๋ฐฑ
556
+ use_rubbing (bool): ํƒ๋ณธ ๋ชจ๋“œ
557
+
558
+ Returns:
559
+ dict: ์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ
560
+ """
561
+ prep = get_preprocessor()
562
+ return prep.preprocess_unified(
563
+ input_path,
564
+ output_swin_path,
565
+ output_ocr_path,
566
+ margin,
567
+ use_rubbing
568
+ )
569
+
570
+
571
+ # ================================================================================
572
+ # Usage Example
573
+ # ================================================================================
574
+
575
+
576
+ if __name__ == "__main__":
577
+ """
578
+ Test example
579
+ """
580
+ logger.info("=" * 80)
581
+ logger.info("[TEST] Unified Image Preprocessor v1.0.0 - ํ…Œ์ŠคํŠธ ์‹œ์ž‘")
582
+ logger.info("=" * 80)
583
+
584
+ try:
585
+ prep = UnifiedImagePreprocessor()
586
+
587
+ result = prep.preprocess_unified(
588
+ "test_input.jpg",
589
+ "test_swin.jpg",
590
+ "test_ocr.png"
591
+ )
592
+
593
+ if result["success"]:
594
+ logger.info("[TEST] ํ†ตํ•ฉ ์ „์ฒ˜๋ฆฌ ์„ฑ๊ณต!")
595
+ logger.info(f"[TEST] Swin: {result['swin']['output_path']}")
596
+ logger.info(f"[TEST] OCR: {result['ocr']['output_path']}")
597
+ logger.info(f"[TEST] Swin ๋ฐ์€๋ฐฐ๊ฒฝ: {'Yes' if result['swin']['is_bright_bg'] else 'No'}")
598
+ logger.info(f"[TEST] OCR ํฐ๋ฐฐ๊ฒฝ: {'Yes' if result['ocr']['is_white_bg'] else 'No'}")
599
+ else:
600
+ logger.error(f"[TEST] ์‹คํŒจ: {result['message']}")
601
+
602
+ except Exception as e:
603
+ logger.error(f"[TEST] ์˜ˆ์™ธ: {e}")
604
+
605
+ logger.info("=" * 80)
dong_ocr.py ADDED
@@ -0,0 +1,349 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Standalone OCR script
4
+ Hanja OCR and damaged-region detection based on a Google Vision API + HRCenterNet ensemble
5
+
6
+ Changes:
7
+ 1. Added logic that detects X-coordinate jumps to split the output into columns automatically
8
+ 2. Applied Safe Crop (floor min / ceil max) so [MASK] boxes and others lose no fractional-pixel area
9
+ -> applied not only to the visualization but to the JSON result data itself, removing fractional coordinates
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import json
15
+ import logging
16
+ import cv2
17
+ import math
18
+ import numpy as np
19
+ from pathlib import Path
20
+ from dotenv import load_dotenv
21
+
22
+ # Add the script's directory to the Python path
23
+ current_dir = os.path.dirname(os.path.abspath(__file__))
24
+ if current_dir not in sys.path:
25
+ sys.path.insert(0, current_dir)
26
+
27
+ # Load environment variables
28
+ load_dotenv()
29
+
30
+ # Logging setup
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format='%(asctime)s - [%(levelname)s] %(message)s'
34
+ )
35
+ logger = logging.getLogger("DONG_OCR")
36
+
37
+ # Import the OCR engine and preprocessing modules
38
+ try:
39
+ from ai_modules.ocr_engine import get_ocr_engine
40
+ from ai_modules.preprocessor_unified import preprocess_image_unified
41
+ except ImportError as e:
42
+ logger.error(f"โŒ ๋ชจ๋“ˆ import ์‹คํŒจ: {e}")
43
+ sys.exit(1)
44
+
45
+
46
+ def format_ocr_results(raw_results, image_filename):
47
+ """
48
+ Convert raw OCR results into the requested JSON format.
49
+ Change: Safe Crop (floor/ceil) is also applied to the saved JSON, converting coordinates to integers.
50
+ """
51
+ formatted_list = []
52
+
53
+ if raw_results is None:
54
+ raw_results = []
55
+
56
+ if not raw_results:
57
+ return {"image": image_filename, "results": []}
58
+
59
+ order_counter = 0
60
+ for idx, item in enumerate(raw_results):
61
+ if not isinstance(item, dict): continue
62
+
63
+ min_x, min_y, max_x, max_y = 0.0, 0.0, 0.0, 0.0
64
+
65
+ # 1. If a 'box' list is already present
66
+ if 'box' in item and isinstance(item['box'], list) and len(item['box']) == 4:
67
+ try:
68
+ min_x, min_y, max_x, max_y = map(float, item['box'])
69
+ except (TypeError, ValueError): pass  # malformed box values fall through to the key-based path
70
+
71
+ # 2. If 'box' is missing (or failed to parse), use the individual coordinate keys
72
+ if min_x == 0 and max_x == 0:
73
+ mx = item.get('min_x')
74
+ my = item.get('min_y')
75
+ Mx = item.get('max_x')
76
+ My = item.get('max_y')
77
+
78
+ if mx is None: mx = item.get('x', 0)
79
+ if my is None: my = item.get('y', 0)
80
+ if Mx is None:
81
+ Mx = item.get('x2')
82
+ if Mx is None:
83
+ width = item.get('width', 0)
84
+ Mx = mx + width if width > 0 else 0
85
+ if My is None:
86
+ My = item.get('y2')
87
+ if My is None:
88
+ height = item.get('height', 0)
89
+ My = my + height if height > 0 else 0
90
+
91
+ try:
92
+ min_x, min_y, max_x, max_y = float(mx), float(my), float(Mx), float(My)
93
+ except (TypeError, ValueError): continue
94
+
95
+ if min_x == 0 and min_y == 0 and max_x == 0 and max_y == 0:
96
+ width = item.get('width', 0)
97
+ height = item.get('height', 0)
98
+ if width > 0 and height > 0:
99
+ cx, cy = item.get('center_x', width/2), item.get('center_y', height/2)
100
+ min_x, min_y = cx - width/2, cy - height/2
101
+ max_x, max_y = cx + width/2, cy + height/2
102
+ else: continue
103
+
104
+ if max_x <= min_x or max_y <= min_y: continue
105
+
106
+ # === [์ถ”๊ฐ€๋จ] JSON ๋ฐ์ดํ„ฐ ์ž์ฒด์— Safe Crop ์ ์šฉ (์†Œ์ˆ˜์  ์ œ๊ฑฐ) ===
107
+ # min ์ขŒํ‘œ๋Š” ๋‚ด๋ฆผ(floor), max ์ขŒํ‘œ๋Š” ์˜ฌ๋ฆผ(ceil)ํ•˜์—ฌ ์˜์—ญ ํ™•๋ณด ํ›„ ์ •์ˆ˜ ๋ณ€ํ™˜
108
+ min_x = int(math.floor(min_x))
109
+ min_y = int(math.floor(min_y))
110
+ max_x = int(math.ceil(max_x))
111
+ max_y = int(math.ceil(max_y))
112
+
113
+ # Prevent negative coordinates (clamp to 0)
114
+ min_x = max(0, min_x)
115
+ min_y = max(0, min_y)
116
+ # ==========================================================
117
+
118
+ new_item = {
119
+ "order": order_counter,
120
+ "text": item.get('text', ''),
121
+ "type": item.get('type', 'TEXT'),
122
+ "box": [min_x, min_y, max_x, max_y],
123
+ "confidence": float(item.get('confidence', 0.0)),
124
+ "source": item.get('source', 'Unknown')
125
+ }
126
+ formatted_list.append(new_item)
127
+ order_counter += 1
128
+
129
+ return {"image": image_filename, "results": formatted_list}
130
+
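+ # Illustrative output shape (values made up). Safe Crop floors the min and ceils the max
+ # coordinates, e.g. a raw box [12.3, 40.9, 80.1, 95.2] becomes [12, 40, 81, 96]:
+ # {
+ #   "image": "page.jpg",
+ #   "results": [
+ #     {"order": 0, "text": "ๅคฉ", "type": "TEXT", "box": [12, 40, 81, 96],
+ #      "confidence": 0.93, "source": "Google"}
+ #   ]
+ # }
+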
131
+
132
+ def draw_bboxes(image_path, results, output_path):
133
+ """์ด๋ฏธ์ง€์— Bounding Box ๊ทธ๋ฆฌ๊ธฐ (Safe Crop ์ ์šฉ)"""
134
+ try:
135
+ img_array = np.fromfile(image_path, np.uint8)
136
+ img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
137
+ if img is None:
138
+ img = cv2.imread(image_path)
139
+ if img is None: return
140
+
141
+ box_count = 0
142
+ colors = {
143
+ 'Google': (0, 255, 0), 'Custom': (255, 0, 255),
144
+ 'MASK1': (255, 0, 0), 'MASK2': (0, 0, 255), 'Default': (0, 255, 255)
145
+ }
146
+
147
+ for item in results:
148
+ box = item.get('box', [])
149
+ if len(box) != 4: continue
150
+ try:
151
+ # format_ocr_results already returns integers, but repeat the conversion here
152
+ # for safety (kept so the code still handles floats if they slip through)
153
+ x1 = int(math.floor(float(box[0])))
154
+ y1 = int(math.floor(float(box[1])))
155
+ x2 = int(math.ceil(float(box[2])))
156
+ y2 = int(math.ceil(float(box[3])))
157
+ except (TypeError, ValueError): continue
158
+
159
+ h, w = img.shape[:2]
160
+ # Clip to the image bounds
161
+ x1 = max(0, min(x1, w-1))
162
+ y1 = max(0, min(y1, h-1))
163
+ x2 = max(x1+1, min(x2, w))
164
+ y2 = max(y1+1, min(y2, h))
165
+
166
+ text = item.get('text', '')
167
+ source = item.get('source', '')
168
+ itype = item.get('type', 'TEXT')
169
+
170
+ if 'MASK1' in itype or '[MASK1]' in text: color = colors['MASK1']
171
+ elif 'MASK2' in itype or '[MASK2]' in text: color = colors['MASK2']
172
+ elif source in colors: color = colors[source]
173
+ else: color = colors['Default']
174
+
175
+ cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
176
+
177
+ if itype == 'TEXT' and len(text) <= 2:
178
+ cv2.putText(img, text, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
179
+ elif 'MASK' in itype:
180
+ label = '[M1]' if itype == 'MASK1' else '[M2]'
181
+ cv2.putText(img, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)
182
+
183
+ box_count += 1
184
+
185
+ ext = os.path.splitext(output_path)[1].lower()
186
+ params = [int(cv2.IMWRITE_JPEG_QUALITY), 95] if ext in ['.jpg', '.jpeg'] else [int(cv2.IMWRITE_PNG_COMPRESSION), 3]
187
+ result, encoded_img = cv2.imencode(ext, img, params)
188
+ if result:
189
+ with open(output_path, mode='wb') as f: encoded_img.tofile(f)
190
+ logger.info(f"๐Ÿ–ผ๏ธ B-Box ์ด๋ฏธ์ง€ ์ €์žฅ๋จ: {output_path} ({box_count}๊ฐœ ๋ฐ•์Šค)")
191
+
192
+ except Exception as e:
193
+ logger.error(f"โŒ ์‹œ๊ฐํ™” ์ค‘ ์˜ค๋ฅ˜: {e}")
194
+
195
+
196
+ def run_ocr(image_path, use_preprocessing=True):
197
+ """OCR ์‹คํ–‰, ๊ฒฐ๊ณผ ์ถœ๋ ฅ ๋ฐ ์ €์žฅ"""
198
+ if not os.path.exists(image_path):
199
+ logger.error(f"โŒ ์ด๋ฏธ์ง€ ์—†์Œ: {image_path}")
200
+ return False
201
+
202
+ logger.info(f"๐Ÿš€ OCR ๋ถ„์„ ์‹œ์ž‘: {image_path}")
203
+
204
+ try:
205
+ # 1. Preprocessing
206
+ ocr_image_path = image_path
207
+ preprocess_result = {'success': False}
208
+ if use_preprocessing:
209
+ logger.info("๐Ÿ“ธ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ ์ค‘...")
210
+ base_dir = os.path.dirname(os.path.abspath(image_path))
211
+ base_name = os.path.splitext(os.path.basename(image_path))[0]
212
+ swin_path = os.path.join(base_dir, f"{base_name}_swin_temp.jpg")
213
+ ocr_preprocessed_path = os.path.join(base_dir, f"{base_name}_ocr_temp.png")
214
+
215
+ preprocess_result = preprocess_image_unified(
216
+ input_path=image_path, output_swin_path=swin_path,
217
+ output_ocr_path=ocr_preprocessed_path, use_rubbing=True
218
+ )
219
+ if preprocess_result.get('success'):
220
+ ocr_image_path = ocr_preprocessed_path
221
+ logger.info(f"โœ… ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ: {ocr_preprocessed_path}")
222
+ else:
223
+ logger.warning(f"โš ๏ธ ์ „์ฒ˜๋ฆฌ ์‹คํŒจ: {preprocess_result.get('message')}")
224
+
225
+ # 2. Run the engine
226
+ engine = get_ocr_engine()
227
+ logger.info("โœ… OCR ์—”์ง„ ๋กœ๋“œ ์™„๋ฃŒ")
228
+
229
+ try:
230
+ raw_result = engine.run_ocr(ocr_image_path)
231
+ except Exception as e:
232
+ logger.error(f"โŒ OCR ์‹คํ–‰ ์˜ˆ์™ธ: {e}")
233
+ return False
234
+
235
+ if not raw_result: return False
236
+
237
+ is_success = raw_result.get('success', False)
238
+ if not is_success and 'results' in raw_result and isinstance(raw_result['results'], list):
239
+ is_success = True
240
+
241
+ if not is_success:
242
+ logger.error(f"โŒ OCR ์‹คํŒจ: {raw_result.get('error')}")
243
+ return False
244
+
245
+ logger.info("\n" + "="*60)
246
+ logger.info("โœ… OCR ๋ถ„์„ ์™„๋ฃŒ")
247
+
248
+ # 3. ๋ฐ์ดํ„ฐ ํฌ๋งทํŒ…
249
+ formatted_result = format_ocr_results(raw_result.get('results', []), os.path.basename(image_path))
250
+ results_list = formatted_result.get('results', [])
251
+
252
+ # 4. [Column-split output] Compute columns from the coordinates and print them
253
+ logger.info("\n" + "๐Ÿ“œ [ ์ธ์‹๋œ ํ…์ŠคํŠธ ๊ฒฐ๊ณผ (์ž๋™ ์—ด ๊ตฌ๋ถ„) ] " + "-"*25)
254
+
255
+ if not results_list:
256
+ logger.info(" (๊ฒฐ๊ณผ ์—†์Œ)")
257
+ else:
258
+ columns = []
259
+ current_col_text = []
260
+
261
+ # X center of the first character
262
+ first_box = results_list[0]['box']
263
+ prev_cx = (first_box[0] + first_box[2]) / 2
264
+
265
+ for item in results_list:
266
+ box = item['box']
267
+ curr_cx = (box[0] + box[2]) / 2
268
+
269
+ # Extract the text (MASK handling)
270
+ text = item.get('text', '')
271
+ if item.get('type') in ['MASK1', 'MASK2']:
272
+ text = f"[{item.get('type')}]"
273
+
274
+ # === ์—ด ๊ตฌ๋ถ„ ํ•ต์‹ฌ ๋กœ์ง ===
275
+ # ์ด์ „ ๊ธ€์ž์™€ X์ขŒํ‘œ ์ค‘์‹ฌ์ด 50ํ”ฝ์…€ ์ด์ƒ ์ฐจ์ด๋‚˜๋ฉด ์ƒˆ๋กœ์šด ์—ด๋กœ ๊ฐ„์ฃผ
276
+ # (์ผ๋ฐ˜์ ์œผ๋กœ ์„ธ๋กœ์“ฐ๊ธฐ์—์„œ ์ค„๋ฐ”๊ฟˆ ์‹œ X์ขŒํ‘œ๊ฐ€ ํฌ๊ฒŒ ๋ณ€ํ•จ)
277
+ if abs(curr_cx - prev_cx) > 50:
278
+ if current_col_text:
279
+ columns.append("".join(current_col_text))
280
+ current_col_text = []
281
+ prev_cx = curr_cx # ์ƒˆ๋กœ์šด ์—ด์˜ ๊ธฐ์ค€์œผ๋กœ ๊ฐฑ์‹ 
282
+
283
+ current_col_text.append(text)
284
+ # ๊ฐ™์€ ์—ด ๋‚ด์—์„œ๋Š” ๋ฏธ์„ธํ•œ X ํ”๋“ค๋ฆผ์ด ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ prev_cx๋ฅผ ๊ณ„์† ๊ฐฑ์‹ ํ•˜์ง€ ์•Š๊ณ 
285
+ # ํ•ด๋‹น ์—ด์˜ '๋Œ€ํ‘œ' X๊ฐ’์„ ์œ ์ง€ํ•˜๊ฑฐ๋‚˜, ํ˜น์€ ๊ธ€์ž๋งˆ๋‹ค ๊ฐฑ์‹ ํ•  ์ˆ˜ ์žˆ์Œ.
286
+ # ์—ฌ๊ธฐ์„œ๋Š” ๊ธ€์ž๊ฐ€ ๋น„์Šค๋“ฌํ•  ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ๋งค๋ฒˆ ๊ฐฑ์‹ ํ•˜๋Š” ๋ฐฉ์‹์„ ์”€
287
+ prev_cx = curr_cx
288
+
289
+ # ๋งˆ์ง€๋ง‰ ์—ด ์ถ”๊ฐ€
290
+ if current_col_text:
291
+ columns.append("".join(current_col_text))
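+
+ # Worked example (hypothetical numbers): with the 50 px threshold, X centers
+ # [310, 308, 305, 240, 238] split after the third character (|240 - 305| = 65 > 50),
+ # yielding the two columns [310, 308, 305] and [240, 238]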
+
+ # Print one line per detected column
+ for idx, col_text in enumerate(columns, 1):
+ logger.info(f" [Col {idx:02d}] {col_text}")
+
+ logger.info("-" * 60 + "\n")
+
+ # 5. Save the results as JSON
+ json_path = os.path.splitext(image_path)[0] + "_ocr_result.json"
+ with open(json_path, 'w', encoding='utf-8') as f:
+ json.dump(formatted_result, f, ensure_ascii=False, indent=2)
+ logger.info(f"💾 JSON result saved: {json_path}")
+
+ # 6. Save the bounding-box visualization
+ output_img_path = os.path.splitext(image_path)[0] + "_bbox.jpg"
+ bbox_image_path = ocr_image_path if use_preprocessing and preprocess_result.get('success') else image_path
+ draw_bboxes(bbox_image_path, results_list, output_img_path)
+
+ # 7. Statistics (counted separately by source and by type)
+ counts = {'Google': 0, 'Custom': 0, 'MASK1': 0, 'MASK2': 0, 'TEXT': 0}
+ for r in results_list:
+ if r['source'] in counts:
+ counts[r['source']] += 1
+ if r['type'] in counts:
+ counts[r['type']] += 1
+
+ logger.info("📊 Final statistics")
+ logger.info(f" - 🟢 Google: {counts['Google']}")
+ logger.info(f" - 🟣 Custom: {counts['Custom']}")
+ logger.info(f" - 🔵 MASK1: {counts['MASK1']}")
+ logger.info(f" - 🔴 MASK2: {counts['MASK2']}")
+ logger.info(f" - 📝 TEXT: {counts['TEXT']}")
+ logger.info("="*60)
+
+ return True
+
+ except Exception as e:
+ logger.error(f"❌ Error: {e}", exc_info=True)
+ return False
+
+
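For reference, a minimal sketch of the per-item shape that run_ocr assumes for entries in results_list, listing only the fields actually read above (the authoritative schema comes from format_ocr_results, which lives elsewhere in the module):

    # Hypothetical example item, for illustration only.
    example_item = {
        "text": "天",                # recognized character (empty for masks)
        "box": [120, 40, 168, 92],   # [x1, y1, x2, y2] in pixels
        "type": "TEXT",              # or "MASK1" / "MASK2"
        "source": "Google",          # or "Custom"
    }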
+ def main():
+ if len(sys.argv) < 2:
+ print("Usage: python dong_ocr.py <image>")
+ sys.exit(1)
+
+ if not os.getenv('OCR_WEIGHTS_BASE_PATH') or not os.getenv('GOOGLE_CREDENTIALS_JSON'):
+ logger.error("❌ Required environment variables are not set (OCR_WEIGHTS_BASE_PATH, GOOGLE_CREDENTIALS_JSON)")
+ sys.exit(1)
+
+ if run_ocr(sys.argv[1]):
+ logger.info("✅ Done!")
+ sys.exit(0)
+ else:
+ logger.error("❌ Job failed")
+ sys.exit(1)
+
+
+ if __name__ == "__main__":
+ main()
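As a quick smoke test, something like the following exercises the pipeline end to end (a minimal sketch: the image and credential paths are placeholders, and the script is assumed to be saved as dong_ocr.py, matching the usage string above):

    import os

    # Both variables are checked in main(); the values here are placeholders.
    os.environ["OCR_WEIGHTS_BASE_PATH"] = "./weights"
    os.environ["GOOGLE_CREDENTIALS_JSON"] = "/path/to/service_account.json"

    from dong_ocr import run_ocr

    # Writes <image>_ocr_result.json and <image>_bbox.jpg next to the input.
    ok = run_ocr("sample_epitaph.png", use_preprocessing=True)
    print("success" if ok else "failed")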
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ opencv-python
+ numpy
+ torch
+ torchvision
+ python-dotenv
+ Pillow
+ google-cloud-vision
+ huggingface-hub>=0.34.0,<1.0
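Installing these with pip install -r requirements.txt in a fresh virtual environment should reproduce the runtime; note that torch and torchvision are unpinned, so a GPU setup may require selecting a CUDA-matched build from the PyTorch index.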
weights/best.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31db0eec96515dc820475df31245d7fe51ffcc56c76dc70df4e2bf83ff21d7e6
+ size 115004284
weights/best_5000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3180e120cb82a5dc2474448a3b61ea72b53e2507a2dc367bda76dc222a35ec6
+ size 62505977
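Both checkpoint files are Git LFS pointers: a clone without LFS fetches only these stubs, while git lfs pull retrieves the actual binaries (~115 MB and ~63 MB). Since huggingface-hub is already a dependency, they can also be fetched programmatically; a sketch, with the repo id as a placeholder for wherever this repository is hosted:

    from huggingface_hub import hf_hub_download

    # "your-org/epitext-ocr" is a hypothetical repo id; LFS files are
    # resolved transparently, so this returns a path to the real checkpoint.
    ckpt_path = hf_hub_download(
        repo_id="your-org/epitext-ocr",
        filename="weights/best.pth",
    )
    print(ckpt_path)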