# Source: Hugging Face Space "Ash2749/math-ocr" (status: Running)
# File size: 28,416 bytes; revision 1a3e965

import cv2
import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
import numpy as np
import json
from tqdm import tqdm
import unicodedata
from collections import defaultdict
from PIL import Image
import logging


# Pix2Text is an optional dependency: record whether the import worked in
# PIX2TEXT_AVAILABLE so the rest of the module can check the flag instead of
# importing again. Either outcome is announced on stdout at import time.
try:
    from pix2text import Pix2Text

    PIX2TEXT_AVAILABLE = True
    print("Pix2Text imported successfully for advanced math extraction")
except ImportError:
    # Only a missing package is handled here; any other import-time error
    # still propagates.
    PIX2TEXT_AVAILABLE = False
    print("Pix2Text not available. Install with: pip install pix2text")
    print("   Falling back to traditional OCR for math expressions")


# Configure the root logger at INFO level and create the module-level logger
# used by the functions in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ----------------------------
# STEP 1: Enhanced Character Classification
# ----------------------------
# Lookup tables for classify_character. They are built once at import time
# because the classifier is invoked for every single character of every OCR
# result; rebuilding them per call was pure overhead.
_MATH_CHARS = frozenset(
    "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
    "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
    "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
)

# Inclusive (first, last) code point bounds that are treated as math.
_MATH_RANGES = (
    ("\u2200", "\u22ff"),  # Mathematical Operators
    ("\u2190", "\u21ff"),  # Arrows
    ("\u0370", "\u03ff"),  # Greek and Coptic
    ("\u2070", "\u209f"),  # Superscripts and Subscripts
    ("\u27c0", "\u27ef"),  # Miscellaneous Mathematical Symbols-A
    ("\u2980", "\u29ff"),  # Miscellaneous Mathematical Symbols-B
)

# NOTE: "-" is listed here but is never reported as punctuation because the
# math symbol check runs first and already claims it.
_PUNCTUATION_CHARS = ".,;:!?()[]{}\"'-_/\\^"


def classify_character(char):
    """Classify a single character into a coarse script/category label.

    The checks run in a fixed priority order, so a character that matches
    several categories gets the first one: space, bangla, math, number,
    english, punctuation, other.

    Args:
        char: A one-character string. An empty string is treated as "space".

    Returns:
        One of "space", "bangla", "math", "number", "english",
        "punctuation" or "other".
    """
    if not char or char.isspace():
        return "space"

    # Bangla Unicode block (this also captures Bangla digits).
    if "\u0980" <= char <= "\u09ff":
        return "bangla"

    if char in _MATH_CHARS:
        return "math"

    if any(first <= char <= last for first, last in _MATH_RANGES):
        return "math"

    # Digits are reported separately; callers weight them as "half math".
    if char.isdigit():
        return "number"

    if char.isascii() and char.isalpha():
        return "english"

    if char in _PUNCTUATION_CHARS:
        return "punctuation"

    return "other"


def classify_text_region(text):
    """Label a text snippet by its dominant content type.

    Characters are classified one by one with ``classify_character``;
    whitespace is ignored and the remaining categories are turned into
    shares of the total.

    Args:
        text: The recognized text of one region.

    Returns:
        "empty" for blank text, otherwise "bangla", "math", "english" or
        "mixed". Bangla wins when it exceeds half of the characters; math is
        chosen next when the math score exceeds 0.3 or ``has_math_patterns``
        matches; English needs more than half of the characters.
    """
    if not text.strip():
        return "empty"

    tally = defaultdict(int)
    for ch in text:
        tally[classify_character(ch)] += 1

    # Whitespace never counts toward the shares computed below.
    tally.pop("space", None)
    if not tally:
        return "empty"

    total = sum(tally.values())

    def share(kind):
        return tally.get(kind, 0) / total

    if share("bangla") > 0.5:
        return "bangla"

    # Digits contribute only half their share to the math score.
    if share("math") + share("number") * 0.5 > 0.3 or has_math_patterns(text):
        return "math"

    if share("english") > 0.5:
        return "english"

    return "mixed"


def has_math_patterns(text):
    """Report whether ``text`` contains something that looks like math.

    The text is searched, case-insensitively, against a fixed list of
    regular expressions; the first hit is enough.

    Args:
        text: The string to inspect.

    Returns:
        True when any pattern matches, False otherwise.
    """
    import re

    expressions = (
        r"\d+[\+\-\*/=]\d+",  # digits, an operator, digits
        r"[xy]\^?\d+",  # x or y followed by an optional caret and digits
        r"\\[a-zA-Z]+",  # backslash command such as \frac
        r"\$.*?\$",  # text wrapped in dollar signs
        r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",  # call-like forms such as f(x)
        r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",  # function names
        r"[≤≥≠≈∫∑∂∞]",  # a literal math symbol
    )

    return any(
        re.search(expression, text, re.IGNORECASE) is not None
        for expression in expressions
    )


# ----------------------------
# STEP 2: Initialize Pix2Text
# ----------------------------
def initialize_pix2text():
    """Build a Pix2Text model, trying several constructors in turn.

    Returns:
        The first model that is constructed successfully, or None when
        Pix2Text is not installed or every construction attempt fails.
    """
    if not PIX2TEXT_AVAILABLE:
        return None

    try:
        logger.info("Initializing Pix2Text...")

        # Each entry: (builder, success message, failure logger, failure
        # message prefix). Entries are tried in order; only the last failure
        # is reported at error level.
        strategies = (
            (
                lambda: Pix2Text.from_config(),
                "✅ Pix2Text initialized with default config",
                logger.warning,
                "Default Pix2Text init failed: ",
            ),
            (
                lambda: Pix2Text(),
                "✅ Pix2Text initialized with basic constructor",
                logger.warning,
                "Basic Pix2Text init failed: ",
            ),
            (
                # Force CPU to avoid CUDA issues
                lambda: Pix2Text.from_config({"device": "cpu"}),
                "✅ Pix2Text initialized with CPU config",
                logger.error,
                "All Pix2Text initialization methods failed: ",
            ),
        )

        for build, success_message, report_failure, failure_prefix in strategies:
            try:
                model = build()
                logger.info(success_message)
                return model
            except Exception as error:
                report_failure(f"{failure_prefix}{error}")

        return None

    except Exception as e:
        logger.error(f"❌ Failed to initialize Pix2Text: {e}")
        return None


# ----------------------------
# STEP 3: Enhanced Image Preprocessing
# ----------------------------
def preprocess_image_advanced(pil_image):
    """Clean up a page image for OCR and return it at twice its size.

    Pipeline: RGB -> BGR -> grayscale, non-local-means denoising, Gaussian
    adaptive thresholding, a linear intensity adjustment, then a 2x bicubic
    upscale.

    Args:
        pil_image: The page image; it is converted with ``np.array`` and
            treated as RGB.

    Returns:
        A single-channel array whose width and height are double those of
        the input. Coordinates measured on it are therefore in the upscaled
        space, not the input's.
    """
    bgr = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    # Noise reduction on the grayscale page
    denoised = cv2.fastNlMeansDenoising(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), h=15)

    # Adaptive thresholding (block size 15, constant 5) to separate text
    thresholded = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
    )

    # Linear intensity adjustment
    boosted = cv2.convertScaleAbs(thresholded, alpha=1.2, beta=10)

    # Double both dimensions for OCR
    rows, cols = boosted.shape
    return cv2.resize(boosted, (cols * 2, rows * 2), interpolation=cv2.INTER_CUBIC)


def preprocess_for_pix2text(pil_image, region):
    """Cut a padded crop of ``region`` out of the page for Pix2Text.

    Args:
        pil_image: The page image the region coordinates refer to.
        region: Mapping with integer "left", "top", "width" and "height".

    Returns:
        A PIL image of the region plus a 10-pixel margin (clipped to the
        page), enlarged so both sides are at least 32 pixels, or None when
        the region is degenerate or any step fails. Every None return is
        logged.
    """
    page = np.array(pil_image)

    left, top = region["left"], region["top"]
    box_w, box_h = region["width"], region["height"]

    if box_w <= 0 or box_h <= 0:
        logger.warning(
            f"Invalid region dimensions: w={box_w}, h={box_h}. Skipping Pix2Text."
        )
        return None

    # Pad the box for better recognition, staying inside the page.
    margin = 10
    x0 = max(0, left - margin)
    y0 = max(0, top - margin)
    x1 = min(page.shape[1], left + box_w + margin)
    y1 = min(page.shape[0], top + box_h + margin)

    if x1 <= x0 or y1 <= y0:
        logger.warning(
            f"Invalid crop bounds: x({x0}:{x1}), y({y0}:{y1}). Skipping Pix2Text."
        )
        return None

    crop = page[y0:y1, x0:x1]
    if crop.size == 0:
        logger.warning("Cropped image is empty. Skipping Pix2Text.")
        return None

    try:
        crop_image = Image.fromarray(crop)
    except Exception as e:
        logger.error(f"Failed to create PIL image from cropped array: {e}")
        return None

    if crop_image.width <= 0 or crop_image.height <= 0:
        logger.warning(
            f"Invalid PIL image dimensions: {crop_image.width}x{crop_image.height}"
        )
        return None

    smallest_side = 32
    if crop_image.width >= smallest_side and crop_image.height >= smallest_side:
        return crop_image

    # Too small: enlarge with a single factor so the aspect ratio is kept and
    # both sides reach the minimum.
    try:
        factor = max(smallest_side / crop_image.width, smallest_side / crop_image.height)
        target_w = int(crop_image.width * factor)
        target_h = int(crop_image.height * factor)

        if target_w <= 0 or target_h <= 0:
            logger.warning(f"Invalid resized dimensions: {target_w}x{target_h}")
            return None

        return crop_image.resize((target_w, target_h), Image.LANCZOS)
    except Exception as e:
        logger.error(f"Failed to resize image: {e}")
        return None


# ----------------------------
# STEP 4: Text Detection and Line Segmentation
# ----------------------------
def detect_text_regions(image):
    """Detect word boxes with Tesseract and classify the text of each one.

    Args:
        image: The image handed to ``pytesseract.image_to_data``
            (languages "eng+ben"). Returned coordinates are in this image's
            pixel space.

    Returns:
        A list of dicts with keys "text", "left", "top", "width", "height",
        "confidence" (int) and "type" (from ``classify_text_region``).
        Rows with blank text, confidence of 25 or less, an unparseable
        confidence, or a side shorter than 3 pixels are dropped.
    """
    data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")

    text_regions = []
    for i, raw_text in enumerate(data["text"]):
        text = raw_text.strip()
        if not text:
            continue

        # The confidence column can arrive as an int, a float, or a
        # float-formatted string such as "96.06" depending on the installed
        # Tesseract/pytesseract versions; a bare int() raises ValueError on
        # the latter, so go through float() first.
        raw_conf = data["conf"][i]
        try:
            confidence = int(float(raw_conf))
        except (TypeError, ValueError):
            logger.debug(f"Skipping region with unparseable confidence: {raw_conf!r}")
            continue

        if confidence <= 25:  # Lowered threshold for math
            continue

        width = int(data["width"][i])
        height = int(data["height"][i])
        left = int(data["left"][i])
        top = int(data["top"][i])

        # Skip regions with invalid dimensions
        if width <= 0 or height <= 0:
            logger.debug(f"Skipping region with invalid dimensions: {width}x{height}")
            continue

        # Skip regions that are too small to be meaningful
        if width < 3 or height < 3:
            logger.debug(f"Skipping tiny region: {width}x{height}")
            continue

        text_regions.append(
            {
                "text": text,
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "confidence": confidence,
                "type": classify_text_region(text),
            }
        )

    logger.info(f"Detected {len(text_regions)} valid text regions")
    return text_regions


def group_regions_by_line(regions, line_tolerance=15):
    """Bucket regions into text lines, top to bottom, each sorted by "left".

    Regions are visited in order of "top". A region joins the current line
    when its "top" is within the tolerance of the line's first region;
    otherwise it starts a new line.

    Args:
        regions: Sequence of dicts with "top", "left" and "height".
        line_tolerance: Minimum vertical tolerance in pixels. The effective
            tolerance is the larger of this and 30% of the mean height of
            the line's first region and the candidate.

    Returns:
        A list of lines; each line is a list of the input dicts ordered
        left to right. Empty input yields an empty list.
    """
    if not regions:
        return []

    ordered = sorted(regions, key=lambda item: item["top"])

    grouped = []
    bucket = [ordered[0]]
    anchor_top = ordered[0]["top"]

    for candidate in ordered[1:]:
        # Heights are clamped to at least 1 so zero-height boxes still
        # produce a positive tolerance term.
        anchor_height = max(1, bucket[0]["height"])
        candidate_height = max(1, candidate["height"])
        allowed = max(line_tolerance, (anchor_height + candidate_height) / 2 * 0.3)

        if abs(candidate["top"] - anchor_top) <= allowed:
            bucket.append(candidate)
            continue

        # Candidate is too far down: close the current line and start anew.
        grouped.append(sorted(bucket, key=lambda item: item["left"]))
        bucket = [candidate]
        anchor_top = candidate["top"]

    grouped.append(sorted(bucket, key=lambda item: item["left"]))
    return grouped


# ----------------------------
# STEP 5: Advanced OCR Extractors
# ----------------------------
def extract_english_region(image, region):
    """Re-read one region with Tesseract's English model.

    Args:
        image: Array-like page image indexed as ``image[rows, cols]``.
        region: Mapping with "left", "top", "width", "height" and "text".

    Returns:
        The stripped OCR output for the crop, or ``region["text"]`` when the
        crop is empty, OCR raises, or OCR returns nothing.
    """
    left, top = region["left"], region["top"]
    crop = image[top : top + region["height"], left : left + region["width"]]

    if crop.size == 0:
        return region["text"]

    try:
        # --psm 8: treat the crop as a single word
        recognized = pytesseract.image_to_string(
            crop, config=r"--oem 3 --psm 8 -l eng"
        ).strip()
    except Exception:
        # Best effort: any OCR failure falls back to the detected text.
        recognized = ""

    return recognized or region["text"]


def extract_bangla_region(image, region):
    """Re-read one region with Tesseract's Bangla model.

    Args:
        image: Array-like page image indexed as ``image[rows, cols]``.
        region: Mapping with "left", "top", "width", "height" and "text".

    Returns:
        The stripped OCR output for the crop, or ``region["text"]`` when the
        crop is empty, OCR raises, or OCR returns nothing.
    """
    left, top = region["left"], region["top"]
    crop = image[top : top + region["height"], left : left + region["width"]]

    if crop.size == 0:
        return region["text"]

    try:
        # --psm 8: treat the crop as a single word
        recognized = pytesseract.image_to_string(
            crop, config=r"--oem 3 --psm 8 -l ben"
        ).strip()
    except Exception:
        # Best effort: any OCR failure falls back to the detected text.
        recognized = ""

    return recognized or region["text"]


def extract_math_region_pix2text(pil_image, region, p2t_model):
    """Read a math region with Pix2Text, falling back to Tesseract.

    Args:
        pil_image: Page image the region coordinates refer to.
        region: Mapping with "left", "top", "width", "height" and "text".
        p2t_model: Callable Pix2Text model, or a falsy value to skip it.

    Returns:
        The stripped Pix2Text output when it is non-empty and passes
        ``is_valid_pix2text_result``; otherwise whatever
        ``extract_math_region_traditional`` returns. This covers a missing
        model, failed preprocessing, empty or invalid output, and any
        exception raised along the way.
    """
    if not p2t_model:
        return extract_math_region_traditional(pil_image, region)

    try:
        crop = preprocess_for_pix2text(pil_image, region)
        if crop is None:
            logger.warning(
                "Pix2Text preprocessing failed, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # The raw model output comes in several shapes; normalise to text.
        text = parse_pix2text_result(p2t_model(crop))

        if not (text and text.strip()):
            logger.warning(
                "⚠️  Pix2Text returned empty result, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        if not is_valid_pix2text_result(text):
            logger.warning(f"Invalid Pix2Text result: {text[:100]}...")
            return extract_math_region_traditional(pil_image, region)

        logger.info(f"✅ Pix2Text extracted: {text[:50]}...")
        return text.strip()

    except Exception as e:
        logger.error(f"❌ Pix2Text extraction failed: {e}")
        return extract_math_region_traditional(pil_image, region)


def parse_pix2text_result(result):
    """Normalise a Pix2Text result of any supported shape to plain text.

    Args:
        result: A string, a dict, a list (of strings, dicts or other
            objects), or any other object.

    Returns:
        * str: returned unchanged.
        * dict: the first non-empty value among the keys "text", "formula",
          "latex", "content", "output"; otherwise ``str(result)``, or ""
          when that representation is longer than 1000 characters.
        * list: the space-joined text of its items. Dict items contribute
          their text field (same key lookup as above) rather than their
          whole ``repr``; items that look like debug output are dropped.
        * anything else: ``str(result)``.
        "" is returned if parsing raises.
    """
    text_keys = ("text", "formula", "latex", "content", "output")

    def first_text_field(mapping):
        """Return the first non-empty known text field as str, else None."""
        for key in text_keys:
            if key in mapping and mapping[key]:
                return str(mapping[key])
        return None

    try:
        if isinstance(result, str):
            return result

        if isinstance(result, dict):
            found = first_text_field(result)
            if found is not None:
                return found

            # No known key: fall back to the dict's repr unless it is so
            # long that it is likely debug info.
            rendered = str(result)
            return "" if len(rendered) > 1000 else rendered

        if isinstance(result, list):
            pieces = []
            for item in result:
                # Pix2Text can return a list of per-block dicts; take each
                # block's text instead of stringifying the whole dict, which
                # would leak keys and position data into the OCR output.
                piece = first_text_field(item) if isinstance(item, dict) else None
                if piece is None:
                    piece = str(item)
                piece = piece.strip()
                if piece and not is_debug_content(piece):
                    pieces.append(piece)
            return " ".join(pieces)

        return str(result)

    except Exception as e:
        logger.error(f"Error parsing Pix2Text result: {e}")
        return ""


def is_valid_pix2text_result(text):
    """Decide whether Pix2Text output looks like real recognized content.

    Args:
        text: The candidate output string (may be empty or None).

    Returns:
        False for blank input or input containing any known debug/error
        marker; otherwise True only if at least one ASCII letter, digit,
        or one of the listed operator/bracket/Greek/math characters occurs.
    """
    import re

    candidate = text.strip() if text else ""
    if not candidate:
        return False

    # Substrings that indicate object reprs, errors, or log lines
    rejected_markers = (
        "Page(id=",
        "elements=[]",
        "number=0",
        "Error:",
        "Exception:",
        "Traceback:",
        "DEBUG:",
        "INFO:",
        "WARNING:",
        "ERROR:",
    )
    if any(marker in candidate for marker in rejected_markers):
        return False

    # Require at least one character that could belong to text or math.
    return (
        re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", candidate) is not None
    )


def is_debug_content(text):
    """Report whether ``text`` contains any debug/logging marker substring.

    The check is a plain, case-sensitive substring search, so ordinary text
    that happens to contain a marker (for example "line ") also counts.

    Args:
        text: The string to inspect.

    Returns:
        True if any marker occurs in ``text``, otherwise False.
    """
    debug_indicators = (
        "Page(",
        "id=",
        "number=",
        "elements=",
        "[])",
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "Exception",
        "Traceback",
        'File "',
        "line ",
        " at 0x",
    )
    return any(marker in text for marker in debug_indicators)


def extract_math_region_traditional(pil_image, region):
    """Read a math region with Tesseract and a math-oriented whitelist.

    Args:
        pil_image: Page image (converted with ``np.array`` and treated as
            RGB) that the region coordinates refer to.
        region: Mapping with "left", "top", "width", "height" and "text".

    Returns:
        The stripped OCR output for the crop, or ``region["text"]`` when the
        crop is empty, OCR raises, or OCR returns nothing.
    """
    x, y, w, h = region["left"], region["top"], region["width"], region["height"]

    # Crop before converting colours. The conversion is per-pixel, so the
    # result is identical to converting the whole page and cropping after,
    # but the work now scales with the region instead of the page. This
    # function runs once per math region.
    crop = np.array(pil_image)[y : y + h, x : x + w]
    if crop.size == 0:
        return region["text"]

    roi = cv2.cvtColor(cv2.cvtColor(crop, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2GRAY)

    # Math-optimized OCR with expanded symbol whitelist
    math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
    config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"

    try:
        result = pytesseract.image_to_string(roi, config=config).strip()
    except Exception:
        # Best effort: any OCR failure falls back to the detected text.
        return region["text"]

    return result or region["text"]


def extract_mixed_region(pil_image, region, p2t_model):
    """Read a mixed-content region with several engines and keep the longest.

    Args:
        pil_image: Page image the region coordinates refer to.
        region: Mapping with "left", "top", "width", "height" and "text".
        p2t_model: Pix2Text model or a falsy value.

    Returns:
        Without math patterns in ``region["text"]``: the Bangla reading if it
        is strictly longer than the English one, else the English reading.
        With math patterns: the longest non-blank reading among English,
        Bangla and math (earliest wins ties), or ``region["text"]`` when all
        are blank.
    """
    # The Tesseract helpers expect a grayscale array.
    gray = cv2.cvtColor(
        cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2GRAY
    )

    english_text = extract_english_region(gray, region)
    bangla_text = extract_bangla_region(gray, region)

    if not has_math_patterns(region["text"]):
        # Plain text: pick between the two language readings.
        if len(bangla_text) > len(english_text):
            return bangla_text
        return english_text

    # Possible math: add the math reading to the candidates.
    math_text = extract_math_region_pix2text(pil_image, region, p2t_model)
    candidates = [
        reading
        for reading in (english_text, bangla_text, math_text)
        if reading.strip()
    ]
    if not candidates:
        return region["text"]
    return max(candidates, key=len)


# ----------------------------
# STEP 6: Character Analysis (unchanged)
# ----------------------------
def analyze_character_by_character(text):
    """Break ``text`` down per character and into same-type segments.

    Args:
        text: The string to analyse.

    Returns:
        A dict with:
        * "characters": one record per character ("char", "position",
          "type" from ``classify_character``, "unicode_name").
        * "language_segments": runs of consecutive same-type characters
          ("type", "start", "end", "text"). Spaces and punctuation are
          skipped without ending the current run.
        * "total_chars": ``len(text)``.
        * "language_distribution": count of characters per type.
    """
    char_records = []
    distribution = defaultdict(int)

    for index, ch in enumerate(text):
        kind = classify_character(ch)
        char_records.append(
            {
                "char": ch,
                "position": index,
                "type": kind,
                "unicode_name": unicodedata.name(ch, "UNKNOWN"),
            }
        )
        distribution[kind] += 1

    segments = []
    open_segment = None
    for record in char_records:
        kind = record["type"]
        if kind in ("space", "punctuation"):
            continue

        # Same type as the open run: extend it.
        if open_segment is not None and open_segment["type"] == kind:
            open_segment["end"] = record["position"]
            open_segment["text"] += record["char"]
            continue

        # Type changed (or first significant character): start a new run.
        if open_segment:
            segments.append(open_segment)
        open_segment = {
            "type": kind,
            "start": record["position"],
            "end": record["position"],
            "text": record["char"],
        }

    if open_segment:
        segments.append(open_segment)

    return {
        "characters": char_records,
        "language_segments": segments,
        "total_chars": len(text),
        "language_distribution": distribution,
    }


# ----------------------------
# STEP 7: Main Processing Pipeline
# ----------------------------
def _map_region_to_page(region, scale_x, scale_y):
    """Return a copy of ``region`` with its box scaled by the given factors."""
    mapped = dict(region)
    mapped["left"] = int(region["left"] * scale_x)
    mapped["top"] = int(region["top"] * scale_y)
    # Never let rounding collapse a valid box to zero size.
    mapped["width"] = max(1, round(region["width"] * scale_x))
    mapped["height"] = max(1, round(region["height"] * scale_y))
    return mapped


def process_page_advanced(page_image, page_num, p2t_model):
    """Run detection, per-region extraction and analysis for one page.

    Regions are detected on the preprocessed image, which has different
    pixel dimensions from ``page_image`` (it is upscaled). English and
    Bangla regions are re-read from that same preprocessed image, so their
    coordinates apply directly. Math and mixed regions are re-read from the
    original ``page_image``, so their boxes are first mapped back into the
    original image's coordinate space; previously the upscaled coordinates
    were applied to the original image, which cropped the wrong area.

    Args:
        page_image: The page image (anything ``np.asarray`` can turn into an
            array whose first two dimensions are height and width).
        page_num: Zero-based page index, stored in every result.
        p2t_model: Pix2Text model or a falsy value.

    Returns:
        A list with one dict per region: "page", "line", "text",
        "original_text", "position", "confidence", "detected_type",
        "extraction_method" and "character_analysis". "position" is reported
        in the preprocessed image's coordinates, as before.
    """
    print(f"Processing page {page_num + 1}...")

    # Preprocess image
    processed_image = preprocess_image_advanced(page_image)

    # Factors that map preprocessed-image coordinates back onto page_image.
    processed_height, processed_width = processed_image.shape[:2]
    page_height, page_width = np.asarray(page_image).shape[:2]
    scale_x = page_width / processed_width if processed_width else 1.0
    scale_y = page_height / processed_height if processed_height else 1.0

    # Detect text regions (coordinates are in processed_image space)
    regions = detect_text_regions(processed_image)

    # Group regions by lines
    lines = group_regions_by_line(regions)

    page_results = []

    for line_num, line in enumerate(lines):
        line_text_parts = []

        for region in line:
            region_type = region["type"]

            # Choose appropriate extractor based on region type
            if region_type == "english":
                extracted_text = extract_english_region(processed_image, region)
            elif region_type == "bangla":
                extracted_text = extract_bangla_region(processed_image, region)
            elif region_type == "math":
                extracted_text = extract_math_region_pix2text(
                    page_image,
                    _map_region_to_page(region, scale_x, scale_y),
                    p2t_model,
                )
            elif region_type == "mixed":
                extracted_text = extract_mixed_region(
                    page_image,
                    _map_region_to_page(region, scale_x, scale_y),
                    p2t_model,
                )
            else:
                extracted_text = region["text"]

            # Character-by-character analysis
            char_analysis = analyze_character_by_character(extracted_text)

            page_results.append(
                {
                    "page": page_num,
                    "line": line_num,
                    "text": extracted_text,
                    "original_text": region["text"],
                    "position": {
                        "left": region["left"],
                        "top": region["top"],
                        "width": region["width"],
                        "height": region["height"],
                    },
                    "confidence": region["confidence"],
                    "detected_type": region_type,
                    # Reflects the intended engine; a Pix2Text attempt that
                    # fell back to Tesseract is still labelled "pix2text".
                    "extraction_method": "pix2text"
                    if region_type == "math" and p2t_model
                    else "tesseract",
                    "character_analysis": char_analysis,
                }
            )
            line_text_parts.append(extracted_text)

        # Log line information
        if line_text_parts:
            line_text = " ".join(line_text_parts)
            print(f"  Line {line_num + 1}: {line_text[:100]}...")

    return page_results


def extract_all_text_advanced_pix2text(
    pdf_path, output_text_file, output_json_file, output_analysis_file
):
    """Run the full OCR pipeline on a PDF and write three output files.

    Args:
        pdf_path: Path of the PDF; pages are rendered at 300 dpi.
        output_text_file: Destination for the plain text (regions joined by
            spaces, pages separated by a blank line).
        output_json_file: Destination for the per-region results as JSON.
        output_analysis_file: Destination for the summary produced by
            ``create_extraction_summary`` as JSON.

    Progress and a short summary are printed to stdout. Nothing is returned.
    """
    print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
    p2t_model = initialize_pix2text()
    print(
        "✅ Pix2Text ready for advanced math extraction"
        if p2t_model
        else "⚠️  Using traditional OCR for math expressions"
    )

    print("[INFO] Converting PDF to images...")
    pages = convert_from_path(pdf_path, dpi=300)

    all_results = []
    page_texts = []
    for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
        page_results = process_page_advanced(page_image, page_num, p2t_model)
        all_results += page_results
        page_texts.append(" ".join(entry["text"] for entry in page_results))

    # Plain text output
    with open(output_text_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(page_texts))

    # Detailed per-region output
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    # Aggregated summary output
    summary_analysis = create_extraction_summary(all_results)
    with open(output_analysis_file, "w", encoding="utf-8") as f:
        json.dump(summary_analysis, f, ensure_ascii=False, indent=2)

    print("\n[✅] Advanced Pix2Text extraction complete!")
    print(f"→ Text file saved to: {output_text_file}")
    print(f"→ Detailed JSON saved to: {output_json_file}")
    print(f"→ Analysis report saved to: {output_analysis_file}")

    # Per-type counts; looked up after the summary has been written to disk.
    type_counts = summary_analysis["type_distribution"]
    print("\n📊 Extraction Summary:")
    print(f"   Total text regions: {len(all_results)}")
    for label, key in (
        ("English", "english"),
        ("Bangla", "bangla"),
        ("Math", "math"),
        ("Mixed", "mixed"),
    ):
        print(f"   {label} regions: {type_counts[key]}")

    # Per-engine counts, in order of first appearance
    method_stats = defaultdict(int)
    for entry in all_results:
        method_stats[entry.get("extraction_method", "unknown")] += 1

    print("\n🔧 Extraction Methods Used:")
    for method, count in method_stats.items():
        print(f"   {method}: {count} regions")


def create_extraction_summary(results):
    """Aggregate per-region results into a summary dict.

    Args:
        results: List of region dicts with "page", "detected_type",
            "confidence", "character_analysis" and, optionally,
            "extraction_method" (counted as "unknown" when absent).

    Returns:
        A dict with "total_regions", "total_pages" (distinct page values),
        "type_distribution", "character_distribution", "confidence_stats"
        ("min", "max", "avg"), "language_segments_summary" and
        "extraction_methods". The minimum starts at 100 and the maximum at 0,
        so an empty input reports min 100, max 0, avg 0.
    """
    page_ids = {entry["page"] for entry in results}

    type_counts = defaultdict(int)
    char_counts = defaultdict(int)
    segment_counts = defaultdict(int)
    method_counts = defaultdict(int)
    lowest, highest, confidence_sum = 100, 0, 0

    for entry in results:
        type_counts[entry["detected_type"]] += 1
        method_counts[entry.get("extraction_method", "unknown")] += 1

        confidence = entry["confidence"]
        confidence_sum += confidence
        if confidence < lowest:
            lowest = confidence
        if confidence > highest:
            highest = confidence

        analysis = entry["character_analysis"]
        for kind, amount in analysis["language_distribution"].items():
            char_counts[kind] += amount
        for segment in analysis["language_segments"]:
            segment_counts[segment["type"]] += 1

    average = confidence_sum / len(results) if results else 0

    return {
        "total_regions": len(results),
        "total_pages": len(page_ids),
        "type_distribution": type_counts,
        "character_distribution": char_counts,
        "confidence_stats": {"min": lowest, "max": highest, "avg": average},
        "language_segments_summary": segment_counts,
        "extraction_methods": method_counts,
    }


# ----------------------
# MAIN EXECUTION SECTION
# ----------------------
if __name__ == "__main__":
    # Script entry point: process the bundled sample PDF and write the text,
    # detailed JSON and analysis JSON next to it.
    pdf_path = r"math102.pdf"
    output_text_file = "math102_pix2text.txt"
    output_json_file = "math102_pix2text.json"
    output_analysis_file = "math102_pix2text_analysis.json"

    extract_all_text_advanced_pix2text(
        pdf_path=pdf_path,
        output_text_file=output_text_file,
        output_json_file=output_json_file,
        output_analysis_file=output_analysis_file,
    )