File size: 11,342 Bytes
aa3fdef
 
 
 
 
e82864c
 
aa3fdef
e82864c
 
aa3fdef
e82864c
 
 
 
aa3fdef
 
 
 
 
 
e82864c
aa3fdef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e82864c
aa3fdef
 
e82864c
aa3fdef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e82864c
aa3fdef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e82864c
 
 
 
 
 
 
 
aa3fdef
 
 
 
 
 
 
 
 
e82864c
aa3fdef
e82864c
 
aa3fdef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e82864c
aa3fdef
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
"""

import io
import re
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator

# Try to import optional dependencies
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from langdetect import detect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    from paddleocr import PaddleOCR
    HAS_PADDLEOCR = True
    _paddle_ocr = None
except ImportError:
    HAS_PADDLEOCR = False
    _paddle_ocr = None


# Language code mapping
# Maps lowercase language codes (as produced by detect_language_from_text /
# langdetect) to the codes accepted by deep_translator.GoogleTranslator.
# NOTE(review): only the Chinese entries actually differ in casing; the rest
# are identity mappings kept for explicitness.
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}

# Tesseract language codes for each supported language
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}


def _get_paddle_ocr():
    """Lazily initialize and cache the module-level PaddleOCR engine.

    Returns:
        The shared PaddleOCR instance, or None when PaddleOCR is not
        installed or its initialization failed.

    Initialization is attempted at most once: a failed init (e.g. missing
    model files) is remembered so every subsequent OCR call does not pay
    for a doomed, expensive re-initialization.
    """
    global _paddle_ocr
    if (
        HAS_PADDLEOCR
        and _paddle_ocr is None
        and not getattr(_get_paddle_ocr, "_init_failed", False)
    ):
        try:
            # NOTE(review): 'use_textline_orientation' / 'show_log' are
            # version-sensitive PaddleOCR kwargs — confirm against the
            # pinned paddleocr release.
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except Exception as e:
            _get_paddle_ocr._init_failed = True  # don't retry on every call
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr


def filter_pinyin_keep_chinese(text: str) -> str:
    """Drop pinyin annotation lines and keep only the Chinese characters.

    Each input line is handled independently: lines that consist purely of
    Latin letters / pinyin tone marks are discarded, and from the remaining
    lines only the CJK-ideograph runs are kept (concatenated per line).
    Lines with no Chinese content at all are dropped as well.
    """
    han_runs = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+')
    pure_pinyin = re.compile(r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$')

    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        # Skip blanks and lines that are nothing but (accented) Latin text.
        if not stripped or pure_pinyin.match(stripped):
            continue
        runs = han_runs.findall(raw_line)
        if runs:
            kept.append(''.join(runs))

    return '\n'.join(kept)


def detect_language_from_text(text: str) -> str:
    """Detect the language of *text*.

    CJK scripts are fast-pathed by Unicode range (short CJK strings are
    handled poorly by statistical detectors); otherwise langdetect is used
    when available, and 'en' is the final fallback.

    Returns:
        A lowercase language code ('zh-cn', 'ja', 'ko', a langdetect code,
        or 'en').
    """
    # CJK ideographs (Simplified and Traditional share these ranges) ->
    # report 'zh-cn', matching the keys in LANG_CODE_MAP / TESSERACT_LANG_MAP.
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'

    # Hiragana / Katakana are unique to Japanese.
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'

    # Hangul syllable block.
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'

    if HAS_LANGDETECT:
        try:
            return detect(text)
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # propagate. langdetect raises LangDetectException on short or
            # ambiguous input; keep the best-effort 'en' fallback.
            pass

    return 'en'


def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Preprocess an image array to improve OCR accuracy.

    Args:
        img_array: RGB (H, W, 3) or already-grayscale (H, W) uint8 array.
        method: One of 'simple', 'adaptive', 'clahe', 'denoised', 'advanced';
            any other value returns the plain grayscale image.

    Returns:
        The processed grayscale/binary array; the input unchanged when
        OpenCV is not installed.
    """
    if not HAS_CV2:
        # No OpenCV -> no preprocessing possible.
        return img_array

    # Collapse to a single channel first.
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    def otsu(src):
        # Otsu binarisation picks the threshold automatically.
        return cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    if method == 'simple':
        return otsu(gray)
    if method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    if method == 'clahe':
        # Contrast-limited histogram equalisation before binarising.
        equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        return otsu(equalised)
    if method == 'denoised':
        # Morphological opening removes small specks before binarising.
        opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8), iterations=1)
        return otsu(opened)
    if method == 'advanced':
        equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        smoothed = cv2.fastNlMeansDenoising(equalised, None, 10, 7, 21)
        return cv2.adaptiveThreshold(smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # Unknown method name: fall back to plain grayscale.
    return gray


def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Extract text with PaddleOCR (strongest engine for Chinese).

    Args:
        image_bytes: Raw encoded image bytes.

    Returns:
        (text, confidence_percent) on success, (None, 0) when the engine is
        unavailable, nothing is recognised, or an error occurs.
    """
    engine = _get_paddle_ocr()
    if engine is None:
        return None, 0

    try:
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        ocr_output = engine.ocr(np.array(pil_image), cls=True)

        # PaddleOCR wraps per-image results in an outer list; an empty or
        # None first element means nothing was found.
        if not ocr_output or len(ocr_output) == 0 or ocr_output[0] is None:
            return None, 0

        # Each entry looks like [box, (text, score)]; collect the pairs,
        # skipping anything malformed.
        recognised = []
        for entry in ocr_output[0]:
            if not entry or len(entry) < 2:
                continue
            payload = entry[1]
            if isinstance(payload, tuple) and len(payload) >= 2:
                recognised.append((payload[0], payload[1]))

        if not recognised:
            return None, 0

        joined_text = '\n'.join(item[0] for item in recognised)
        scores = [item[1] for item in recognised]
        mean_score = sum(scores) / len(scores) if scores else 0

        # Scale 0..1 score to a percentage to match the Tesseract path.
        return joined_text, mean_score * 100

    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0


def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Run Tesseract over several preprocessing variants and keep the best.

    Args:
        image_bytes: Raw encoded image bytes.
        lang: Tesseract language string (e.g. 'eng+chi_sim').

    Returns:
        (text, avg_confidence, method_name); text is '' when nothing
        was recognised by any variant.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)

    best_text = ""
    best_confidence = 0
    best_method = ""

    # Fix: without OpenCV every "method" OCRed the exact same unprocessed
    # image, running Tesseract four times for one result. One pass suffices
    # (labelled 'simple' to keep the reported method name unchanged).
    if HAS_CV2:
        methods = ['simple', 'adaptive', 'clahe', 'denoised', 'advanced']
    else:
        methods = ['simple']

    for method in methods:
        try:
            if HAS_CV2:
                processed = _preprocess_image(img_array, method)
                processed_img = Image.fromarray(processed)
            else:
                processed_img = img

            # image_to_data supplies per-word confidences; image_to_string
            # gives the full text layout.
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)

            # Tesseract reports -1 for non-word boxes; keep positive values only.
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0

            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method

        except Exception:
            # One failing preprocessing/OCR combination must not abort the rest.
            continue

    return best_text.strip(), best_confidence, best_method


def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence, method (plus an "error" key when no text was found)
    """
    # Tesseract language string: broad multi-language default, narrowed when
    # the caller supplies a recognised source-language hint.
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        hinted = TESSERACT_LANG_MAP.get(source_lang.lower())
        if hinted:
            tess_lang = hinted

    text, method, confidence = "", "", 0

    # First choice: PaddleOCR, which handles Chinese notably better.
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            text, method, confidence = paddle_text, "PaddleOCR", paddle_conf

    # Fall back to Tesseract when PaddleOCR produced nothing usable.
    if not text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text and (tess_conf > confidence or not text):
            text, method, confidence = tess_text, f"Tesseract-{tess_method}", tess_conf

    if not text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # Strip pinyin annotation lines from Chinese output; if the filter
    # removes everything, keep the raw OCR text instead.
    cleaned = filter_pinyin_keep_chinese(text)
    if not cleaned.strip():
        cleaned = text

    detected_lang = detect_language_from_text(cleaned)

    # Best-effort translation — any failure yields an empty translation
    # rather than an error.
    try:
        translator = GoogleTranslator(
            source=LANG_CODE_MAP.get(detected_lang, detected_lang),
            target=LANG_CODE_MAP.get(target_lang, target_lang),
        )
        translated = translator.translate(cleaned)
    except Exception:
        translated = ""

    return {
        "original_text": cleaned.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(confidence, 2),
        "method": method
    }


def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Runs OCR on a batch of images with advanced processing.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)

    Returns:
        List of dicts with OCR results, one per input image, using the
        legacy key names ("text", "translation", ...).
    """
    use_paddle = prefer_ocr_local and HAS_PADDLEOCR

    def _legacy_shape(raw: Dict[str, Any]) -> Dict:
        # Map the rich per-image result onto the key names older callers expect.
        return {
            "text": raw.get("original_text", ""),
            "translation": raw.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": raw.get("detected_language", "unknown"),
            "confidence": raw.get("confidence", 0),
            "method": raw.get("method", "unknown"),
        }

    return [
        _legacy_shape(
            ocr_single_image(
                image_bytes=img_bytes,
                target_lang=target_lang,
                use_paddle=use_paddle,
            )
        )
        for img_bytes in images
    ]


# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Bare-bones OCR via pytesseract; retained for backward compatibility."""
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    extracted = pytesseract.image_to_string(image)
    return extracted.strip()