agentic-language-partner

Running

File size: 11,342 Bytes

# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian, French
"""

import io
import re
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator

# Try to import optional dependencies
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from langdetect import detect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    from paddleocr import PaddleOCR
    HAS_PADDLEOCR = True
    _paddle_ocr = None
except ImportError:
    HAS_PADDLEOCR = False
    _paddle_ocr = None


# Lower-case language code (the form detect_language_from_text returns,
# e.g. 'zh-cn') -> the code handed to GoogleTranslator. Only the Chinese
# variants differ in casing; every other entry maps to itself.
LANG_CODE_MAP = {
    translator_code.lower(): translator_code
    for translator_code in (
        'zh-CN', 'zh-TW', 'en', 'ja', 'ko', 'fr', 'de', 'es', 'ru',
    )
}

# Accepted source-language hints (short code or lower-case English name)
# -> Tesseract traineddata code. Grouped by Tesseract code so that all
# aliases of one language sit on one line.
TESSERACT_LANG_MAP = {
    alias: tess_code
    for tess_code, aliases in (
        ('eng', ('en', 'english')),
        ('chi_sim', ('zh-cn', 'chinese')),
        ('chi_tra', ('zh-tw',)),
        ('jpn', ('ja', 'japanese')),
        ('kor', ('ko', 'korean')),
        ('deu', ('de', 'german')),
        ('spa', ('es', 'spanish')),
        ('rus', ('ru', 'russian')),
        ('fra', ('fr', 'french')),
    )
    for alias in aliases
}


def _get_paddle_ocr():
    """
    Return the shared PaddleOCR engine, creating it on first use.

    Returns:
        The module-level PaddleOCR instance, or None when the package is not
        installed or construction failed. A failed construction leaves the
        cache empty, so the next call tries again.
    """
    global _paddle_ocr

    # Nothing to build: either the package is missing or we already have one.
    if not HAS_PADDLEOCR or _paddle_ocr is not None:
        return _paddle_ocr

    try:
        _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
    except Exception as exc:
        print(f"[OCR] PaddleOCR init failed: {exc}")
    return _paddle_ocr


def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Strip pinyin (and every other non-Chinese character) from ``text``.

    Each input line that contains at least one character in U+4E00-U+9FFF or
    U+3400-U+4DBF is reduced to just those characters; Latin letters, digits,
    punctuation and spaces on that line are removed. Lines without any such
    character (blank lines, pinyin-only lines, ...) are omitted entirely.

    Args:
        text: Text to filter; lines are separated by newline characters.

    Returns:
        The surviving lines joined with newlines, or "" if none survive.
    """
    han_run = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+')

    # A pinyin-only line has no Han characters, so it collapses to "" here
    # and is discarded together with blank lines by the filter below.
    han_only_lines = (
        ''.join(han_run.findall(raw_line)) for raw_line in text.split('\n')
    )
    return '\n'.join(chunk for chunk in han_only_lines if chunk)


def detect_language_from_text(text: str) -> str:
    """
    Guess the language of ``text`` from its script, then fall back to langdetect.

    Scripts that belong to a single language are tested before the shared Han
    ideographs: Japanese mixes kanji with kana and Korean may mix hanja with
    hangul, so testing for Han characters first would label such text as
    Chinese.

    Args:
        text: Text to classify.

    Returns:
        'ja', 'ko' or 'zh-cn' when the corresponding script is present;
        otherwise the code reported by langdetect (when it is installed and
        detection succeeds); otherwise 'en'.
    """
    # Kana *letters* only. Punctuation-like code points of the kana blocks
    # are not allowed to override the Han check; they are handled further down.
    if re.search(r'[\u3041-\u3096\u30a1-\u30fa]', text):
        return 'ja'

    # Hangul syllables.
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'

    # Han ideographs with no kana/hangul around them: treat as Chinese.
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'

    # Remaining code points of the Hiragana/Katakana blocks (no Han present).
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'

    if HAS_LANGDETECT:
        try:
            return detect(text)
        except Exception:
            # Best effort: if detection fails, use the default below.
            pass

    return 'en'


def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """
    Prepare an image array for OCR using the named preprocessing recipe.

    Args:
        img_array: Image as a numpy array; a 3-dimensional array is converted
            to grayscale with cv2.COLOR_RGB2GRAY, anything else is used as is.
        method: One of 'simple' (Otsu threshold), 'adaptive' (Gaussian
            adaptive threshold), 'clahe' (CLAHE then Otsu), 'denoised'
            (morphological opening then Otsu) or 'advanced' (CLAHE,
            non-local-means denoising, adaptive threshold).

    Returns:
        The processed array. An unrecognised ``method`` yields the grayscale
        image; when OpenCV is not installed the input is returned untouched.
    """
    if not HAS_CV2:
        return img_array

    def otsu(src):
        # cv2.threshold returns (threshold_used, image); only the image matters.
        return cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def adaptive(src):
        return cv2.adaptiveThreshold(
            src, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

    def equalize(src):
        return cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(src)

    is_color = len(img_array.shape) == 3
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if is_color else img_array

    if method == 'simple':
        return otsu(gray)
    if method == 'adaptive':
        return adaptive(gray)
    if method == 'clahe':
        return otsu(equalize(gray))
    if method == 'denoised':
        opened = cv2.morphologyEx(
            gray, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8), iterations=1
        )
        return otsu(opened)
    if method == 'advanced':
        smoothed = cv2.fastNlMeansDenoising(equalize(gray), None, 10, 7, 21)
        return adaptive(smoothed)
    return gray


def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """
    Run PaddleOCR on an encoded image.

    Args:
        image_bytes: Raw bytes of an image file.

    Returns:
        ``(text, confidence)`` where ``text`` is the recognised lines joined
        with newlines and ``confidence`` is the mean line score scaled by
        100. ``(None, 0)`` is returned when the engine is unavailable,
        nothing usable was recognised, or any error occurred (the error is
        printed).
    """
    engine = _get_paddle_ocr()
    if engine is None:
        return None, 0

    try:
        rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        pages = engine.ocr(np.array(rgb_image), cls=True)

        if not pages or len(pages) == 0 or pages[0] is None:
            return None, 0

        # Each detection's second element is read as a (text, score) tuple;
        # detections that do not have that shape are ignored.
        recognized = [
            detection[1]
            for detection in pages[0]
            if detection
            and len(detection) >= 2
            and isinstance(detection[1], tuple)
            and len(detection[1]) >= 2
        ]
        if not recognized:
            return None, 0

        joined_text = '\n'.join(pair[0] for pair in recognized)
        line_scores = [pair[1] for pair in recognized]
        mean_score = sum(line_scores) / len(line_scores)

        return joined_text, mean_score * 100

    except Exception as exc:
        print(f"[OCR] PaddleOCR error: {exc}")
        return None, 0


def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """
    Run Tesseract over several preprocessed variants and keep the best result.

    Args:
        image_bytes: Raw bytes of an image file.
        lang: Tesseract language string (codes joined with '+').

    Returns:
        ``(text, confidence, method)``: the stripped text of the variant with
        the highest mean word confidence, that confidence, and the name of
        the preprocessing method that produced it. ``("", 0, "")`` when no
        variant produced text with a positive confidence.

    Raises:
        Whatever PIL raises if ``image_bytes`` cannot be decoded; per-variant
        OCR errors are reported with print() and that variant is skipped.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)

    best_text = ""
    best_confidence = 0
    best_method = ""

    if HAS_CV2:
        methods = ['simple', 'adaptive', 'clahe', 'denoised', 'advanced']
    else:
        # Without OpenCV no preprocessing is applied, so every method would
        # OCR the very same image; a single pass gives the same answer.
        methods = ['simple']

    for method in methods:
        try:
            if HAS_CV2:
                processed_img = Image.fromarray(_preprocess_image(img_array, method))
            else:
                processed_img = img

            # image_to_data supplies per-word confidences, image_to_string the text.
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)

            # Parse with float(): a confidence may arrive as an int, a float
            # or a numeric string such as '96.5', and int('96.5') would raise
            # and discard the whole variant. Non-positive values (e.g. the -1
            # placeholder) are ignored, as are unparsable ones.
            confidences = []
            for raw_conf in data['conf']:
                try:
                    value = float(raw_conf)
                except (TypeError, ValueError):
                    continue
                if value > 0:
                    confidences.append(value)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0

            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method

        except Exception as e:
            # Report instead of hiding the failure (missing Tesseract binary,
            # missing language data, ...), then try the next variant.
            print(f"[OCR] Tesseract error ({method}): {e}")
            continue

    return best_text.strip(), best_confidence, best_method


def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate it.

    PaddleOCR is tried first (when requested and installed); Tesseract is
    used only if PaddleOCR produced no text.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint; when it is a key of
            TESSERACT_LANG_MAP it restricts Tesseract to that language.
            The language reported in the result is always auto-detected.
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence and method. When no text is found the dict additionally
        carries an "error" entry. If translation fails, translated_text is ""
        and the failure is printed.
    """
    best_text = ""
    best_method = ""
    best_confidence = 0

    # Determine Tesseract language string
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped

    # Try PaddleOCR first (best for Chinese)
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf

    # Tesseract is a pure fallback: it only runs when nothing was found above,
    # so any non-empty Tesseract text is accepted as the result.
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text:
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf

    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # The pinyin filter keeps nothing but Han ideographs. Applied to Japanese
    # (kanji + kana) or Korean (hangul, possibly with hanja) it would delete
    # the kana/hangul, so it is skipped when kana letters or hangul are present.
    if re.search(r'[\u3041-\u3096\u30a1-\u30fa\uac00-\ud7af]', best_text):
        filtered_text = best_text
    else:
        filtered_text = filter_pinyin_keep_chinese(best_text)
        if not filtered_text.strip():
            # No Chinese characters at all: keep the OCR output as it is.
            filtered_text = best_text

    # Detect language
    detected_lang = detect_language_from_text(filtered_text)

    # Translate (best effort: a failure must not discard the OCR result)
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception as e:
        print(f"[OCR] Translation failed: {e}")
        translated = ""

    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }


def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    OCR and translate every image in ``images``, one after another.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Try PaddleOCR first (only honoured when PaddleOCR
            is installed)

    Returns:
        One dict per image, in input order, with the keys text, translation,
        target_lang, detected_language, confidence and method.
    """

    def to_legacy_record(img_bytes):
        outcome = ocr_single_image(
            image_bytes=img_bytes,
            target_lang=target_lang,
            use_paddle=prefer_ocr_local and HAS_PADDLEOCR,
        )
        # Rename the fields to the older key names that callers expect.
        return {
            "text": outcome.get("original_text", ""),
            "translation": outcome.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": outcome.get("detected_language", "unknown"),
            "confidence": outcome.get("confidence", 0),
            "method": outcome.get("method", "unknown"),
        }

    return [to_legacy_record(img_bytes) for img_bytes in images]


# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Return the stripped pytesseract text of an image (legacy helper, default language)."""
    rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    return pytesseract.image_to_string(rgb_image).strip()