| | import cv2
|
| | import pytesseract
|
| | from pytesseract import Output
|
| | from pdf2image import convert_from_path
|
| | import numpy as np
|
| | import json
|
| | from tqdm import tqdm
|
| | import unicodedata
|
| | from collections import defaultdict
|
| | from PIL import Image
|
| | import logging
|
| |
|
| |
|
# Pix2Text is an optional dependency: when it is missing, the script falls
# back to Tesseract-only OCR for mathematical expressions.
try:
    from pix2text import Pix2Text

    PIX2TEXT_AVAILABLE = True
    print("Pix2Text imported successfully for advanced math extraction")
except ImportError:
    PIX2TEXT_AVAILABLE = False
    print("Pix2Text not available. Install with: pip install pix2text")
    print(" Falling back to traditional OCR for math expressions")


# Module-level logger; INFO level so extraction progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
| |
|
| |
|
| |
|
| |
|
| |
|
def classify_character(char):
    """
    Classify a single character as English, Bangla, Math, or Other.
    Enhanced for better math detection.
    """
    if not char or char.isspace():
        return "space"

    # Bangla Unicode block.
    if "\u0980" <= char <= "\u09ff":
        return "bangla"

    # Explicitly enumerated operators, Greek letters, and relation symbols.
    explicit_math = (
        "=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇∀∃∈∉⊂⊃⊆⊇∪∩∧∨¬"
        "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
        "±≈≠≡⇒⇔∘∗⊕⊗⊙⊥∥∦∝∞"
    )
    if char in explicit_math:
        return "math"

    # Unicode blocks: math operators, arrows, Greek/Coptic,
    # super/subscripts, misc. math symbols A/B.
    for lo, hi in (
        ("\u2200", "\u22ff"),
        ("\u2190", "\u21ff"),
        ("\u0370", "\u03ff"),
        ("\u2070", "\u209f"),
        ("\u27c0", "\u27ef"),
        ("\u2980", "\u29ff"),
    ):
        if lo <= char <= hi:
            return "math"

    if char.isdigit():
        return "number"

    if char.isascii() and char.isalpha():
        return "english"

    if char in ".,;:!?()[]{}\"'-_/\\^":
        return "punctuation"

    return "other"
|
| |
|
| |
|
def classify_text_region(text):
    """
    Enhanced text region classification with better math detection.

    Returns one of: "empty", "bangla", "math", "english", "mixed".
    """
    if not text.strip():
        return "empty"

    counts = defaultdict(int)
    for ch in text:
        counts[classify_character(ch)] += 1

    # Whitespace does not count toward the language decision.
    meaningful = {kind: n for kind, n in counts.items() if kind not in ["space"]}
    if not meaningful:
        return "empty"

    total = sum(meaningful.values())
    share = {kind: n / total for kind, n in meaningful.items()}

    # Digits count half toward "math-ness" — numbers also occur in prose.
    math_score = share.get("math", 0) + share.get("number", 0) * 0.5

    if share.get("bangla", 0) > 0.5:
        return "bangla"
    if math_score > 0.3 or has_math_patterns(text):
        return "math"
    if share.get("english", 0) > 0.5:
        return "english"
    return "mixed"
|
| |
|
| |
|
def has_math_patterns(text):
    """
    Detect mathematical patterns in text using regex and heuristics.
    """
    import re

    patterns = (
        r"\d+[\+\-\*/=]\d+",              # arithmetic like 2+3
        r"[xy]\^?\d+",                    # powers of x / y
        r"\\[a-zA-Z]+",                   # LaTeX commands
        r"\$.*?\$",                       # inline LaTeX delimiters
        r"[a-zA-Z]\([a-zA-Z,\d\s]+\)",    # function application f(x)
        r"\b(sin|cos|tan|log|ln|exp|sqrt|int|sum|lim)\b",
        r"[≤≥≠≈∫∑∂∞]",                    # common math symbols
    )

    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
|
| |
|
| |
|
| |
|
| |
|
| |
|
def initialize_pix2text():
    """Initialize Pix2Text model for mathematical expression extraction.

    Tries three construction strategies in order — default config, bare
    constructor, explicit CPU config — and returns the first model that
    initializes. Returns None when the package is unavailable or all
    strategies fail.
    """
    if not PIX2TEXT_AVAILABLE:
        return None

    try:
        logger.info("Initializing Pix2Text...")

        # Strategy 1: library-provided default configuration.
        try:
            p2t = Pix2Text.from_config()
            logger.info("✅ Pix2Text initialized with default config")
            return p2t
        except Exception as e1:
            logger.warning(f"Default Pix2Text init failed: {e1}")

        # Strategy 2: plain constructor with library defaults.
        try:
            p2t = Pix2Text()
            logger.info("✅ Pix2Text initialized with basic constructor")
            return p2t
        except Exception as e2:
            logger.warning(f"Basic Pix2Text init failed: {e2}")

        # Strategy 3: force CPU, in case GPU setup was the failure cause.
        try:
            config = {"device": "cpu"}
            p2t = Pix2Text.from_config(config)
            logger.info("✅ Pix2Text initialized with CPU config")
            return p2t
        except Exception as e3:
            logger.error(f"All Pix2Text initialization methods failed: {e3}")

        return None

    except Exception as e:
        # Defensive outer guard; normally unreachable because the inner
        # strategies already catch their own exceptions.
        logger.error(f"❌ Failed to initialize Pix2Text: {e}")
        return None
|
| |
|
| |
|
| |
|
| |
|
| |
|
def preprocess_image_advanced(pil_image):
    """Enhanced image preprocessing with multiple techniques.

    Pipeline: PIL RGB -> grayscale -> denoise -> adaptive threshold ->
    contrast stretch -> 2x upscale.

    NOTE(review): the returned image is 2x the input size, so region
    coordinates measured on it do NOT map directly back onto the original
    page image — callers must stay in this coordinate space.
    """
    img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Non-local-means denoising; h=15 is fairly aggressive smoothing.
    gray = cv2.fastNlMeansDenoising(gray, h=15)

    # Local (adaptive) binarization copes with uneven page illumination.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 5
    )

    # Contrast/brightness stretch applied to the already-binary image.
    enhanced = cv2.convertScaleAbs(binary, alpha=1.2, beta=10)

    # Upscale 2x with cubic interpolation to help OCR on small glyphs.
    height, width = enhanced.shape
    scaled = cv2.resize(
        enhanced, (width * 2, height * 2), interpolation=cv2.INTER_CUBIC
    )

    return scaled
|
| |
|
| |
|
def preprocess_for_pix2text(pil_image, region):
    """
    Special preprocessing for Pix2Text mathematical expression extraction.

    Crops `region` (plus a small padding margin) out of `pil_image` and
    upscales it so each side is at least 32px. Returns a PIL image, or
    None whenever the region/crop is degenerate so the caller can fall
    back to traditional OCR.

    NOTE(review): assumes `region` coordinates are in the same coordinate
    space as `pil_image` — verify against the caller.
    """
    img = np.array(pil_image)

    x, y, w, h = region["left"], region["top"], region["width"], region["height"]

    if w <= 0 or h <= 0:
        logger.warning(f"Invalid region dimensions: w={w}, h={h}. Skipping Pix2Text.")
        return None

    # Pad the crop slightly for context; clamp to the image bounds.
    padding = 10
    x_start = max(0, x - padding)
    y_start = max(0, y - padding)
    x_end = min(img.shape[1], x + w + padding)
    y_end = min(img.shape[0], y + h + padding)

    if x_end <= x_start or y_end <= y_start:
        logger.warning(
            f"Invalid crop bounds: x({x_start}:{x_end}), y({y_start}:{y_end}). Skipping Pix2Text."
        )
        return None

    cropped = img[y_start:y_end, x_start:x_end]

    if cropped.size == 0:
        logger.warning("Cropped image is empty. Skipping Pix2Text.")
        return None

    try:
        cropped_pil = Image.fromarray(cropped)
    except Exception as e:
        logger.error(f"Failed to create PIL image from cropped array: {e}")
        return None

    # Pix2Text performs poorly on tiny crops; upscale uniformly so the
    # smaller side reaches min_size.
    min_size = 32
    if cropped_pil.width <= 0 or cropped_pil.height <= 0:
        logger.warning(
            f"Invalid PIL image dimensions: {cropped_pil.width}x{cropped_pil.height}"
        )
        return None

    if cropped_pil.width < min_size or cropped_pil.height < min_size:
        try:
            ratio = max(min_size / cropped_pil.width, min_size / cropped_pil.height)
            new_width = int(cropped_pil.width * ratio)
            new_height = int(cropped_pil.height * ratio)

            if new_width <= 0 or new_height <= 0:
                logger.warning(f"Invalid resized dimensions: {new_width}x{new_height}")
                return None

            cropped_pil = cropped_pil.resize((new_width, new_height), Image.LANCZOS)
        except Exception as e:
            logger.error(f"Failed to resize image: {e}")
            return None

    return cropped_pil
|
| |
|
| |
|
| |
|
| |
|
| |
|
def detect_text_regions(image):
    """Detect text regions and classify them by line and character type.

    Runs Tesseract (eng+ben) word-level detection over `image` and returns
    a list of region dicts carrying the recognized text, bounding box,
    confidence, and a coarse type label from classify_text_region.
    """
    data = pytesseract.image_to_data(image, output_type=Output.DICT, lang="eng+ben")

    text_regions = []
    for i in range(len(data["text"])):
        text = data["text"][i].strip()
        if not text:
            continue

        # Tesseract reports confidence as "-1" for non-word boxes and, in
        # some versions, as float strings (e.g. "96.33"); int() alone would
        # raise ValueError on those, so parse through float first.
        conf = int(float(data["conf"][i]))
        if conf <= 25:
            continue

        width = int(data["width"][i])
        height = int(data["height"][i])
        left = int(data["left"][i])
        top = int(data["top"][i])

        # Drop degenerate boxes that would break downstream cropping.
        if width <= 0 or height <= 0:
            logger.debug(
                f"Skipping region with invalid dimensions: {width}x{height}"
            )
            continue

        # Drop boxes too small to contain a legible glyph.
        if width < 3 or height < 3:
            logger.debug(f"Skipping tiny region: {width}x{height}")
            continue

        region = {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "confidence": conf,
            "type": classify_text_region(text),
        }
        text_regions.append(region)

    logger.info(f"Detected {len(text_regions)} valid text regions")
    return text_regions
|
| |
|
| |
|
def group_regions_by_line(regions, line_tolerance=15):
    """Group text regions into lines with better tolerance for math expressions.

    Regions whose tops fall within an adaptive vertical tolerance of the
    current line's anchor are grouped together; each line is returned
    sorted left-to-right.
    """
    if not regions:
        return []

    by_top = sorted(regions, key=lambda r: r["top"])

    lines = []
    line = [by_top[0]]
    anchor_top = by_top[0]["top"]

    for reg in by_top[1:]:
        # Adaptive tolerance: at least `line_tolerance`, or 30% of the mean
        # height of the line's first region and the candidate region.
        h_anchor = max(1, line[0]["height"])
        h_reg = max(1, reg["height"])
        tol = max(line_tolerance, (h_anchor + h_reg) / 2 * 0.3)

        if abs(reg["top"] - anchor_top) <= tol:
            line.append(reg)
        else:
            lines.append(sorted(line, key=lambda r: r["left"]))
            line = [reg]
            anchor_top = reg["top"]

    # Flush the final line.
    if line:
        lines.append(sorted(line, key=lambda r: r["left"]))

    return lines
|
| |
|
| |
|
| |
|
| |
|
| |
|
def extract_english_region(image, region):
    """Extract English text from a specific region with optimized settings.

    Re-OCRs the region crop with English-only, single-word Tesseract
    settings; falls back to the originally detected text on any failure.
    """
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        return region["text"]

    # psm 8: treat the crop as a single word.
    try:
        text = pytesseract.image_to_string(roi, config=r"--oem 3 --psm 8 -l eng").strip()
    except Exception:
        return region["text"]
    return text or region["text"]
|
| |
|
| |
|
def extract_bangla_region(image, region):
    """Extract Bangla text from a specific region with optimized settings.

    Re-OCRs the region crop with Bangla-only, single-word Tesseract
    settings; falls back to the originally detected text on any failure.
    """
    x, y = region["left"], region["top"]
    w, h = region["width"], region["height"]

    roi = image[y : y + h, x : x + w]
    if roi.size == 0:
        return region["text"]

    # psm 8: treat the crop as a single word.
    try:
        text = pytesseract.image_to_string(roi, config=r"--oem 3 --psm 8 -l ben").strip()
    except Exception:
        return region["text"]
    return text or region["text"]
|
| |
|
| |
|
def extract_math_region_pix2text(pil_image, region, p2t_model):
    """
    Extract mathematical expressions using Pix2Text with fallback to traditional OCR.

    Falls back to extract_math_region_traditional whenever the model is
    missing, preprocessing yields no crop, the model raises, or its output
    is empty/looks like debug noise.
    """
    if not p2t_model:
        return extract_math_region_traditional(pil_image, region)

    try:
        # Crop + upscale the region; None signals a degenerate crop.
        math_image = preprocess_for_pix2text(pil_image, region)

        if math_image is None:
            logger.warning(
                "Pix2Text preprocessing failed, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

        # The Pix2Text model object is callable; its return shape varies
        # by library version, so the result is normalized below.
        result = p2t_model(math_image)

        extracted_text = parse_pix2text_result(result)

        if extracted_text and extracted_text.strip():
            # Guard against debug/log dumps leaking through as "results".
            if not is_valid_pix2text_result(extracted_text):
                logger.warning(f"Invalid Pix2Text result: {extracted_text[:100]}...")
                return extract_math_region_traditional(pil_image, region)

            logger.info(f"✅ Pix2Text extracted: {extracted_text[:50]}...")
            return extracted_text.strip()
        else:
            logger.warning(
                "⚠️ Pix2Text returned empty result, falling back to traditional OCR"
            )
            return extract_math_region_traditional(pil_image, region)

    except Exception as e:
        logger.error(f"❌ Pix2Text extraction failed: {e}")
        return extract_math_region_traditional(pil_image, region)
|
| |
|
| |
|
def parse_pix2text_result(result):
    """
    Parse Pix2Text result handling various response formats.

    Normalizes dict / list / str / other return shapes into one plain
    string; returns "" for empty or unusable results.
    """
    try:
        if isinstance(result, dict):
            # Prefer well-known content keys in priority order.
            for key in ("text", "formula", "latex", "content", "output"):
                value = result.get(key)
                if value:
                    return str(value)

            # No known key: stringify, but refuse huge debug dumps.
            dumped = str(result)
            return "" if len(dumped) > 1000 else dumped

        if isinstance(result, list):
            if not result:
                return ""

            # Keep only items that look like content, not logging noise.
            kept = []
            for item in result:
                piece = str(item).strip()
                if piece and not is_debug_content(piece):
                    kept.append(piece)
            return " ".join(kept)

        if isinstance(result, str):
            return result

        return str(result)

    except Exception as e:
        logger.error(f"Error parsing Pix2Text result: {e}")
        return ""
|
| |
|
| |
|
def is_valid_pix2text_result(text):
    """
    Check if the Pix2Text result is valid mathematical content.

    Rejects empty strings, anything containing known debug/log markers,
    and strings with no alphanumeric or math-like character at all.
    """
    if not text or not text.strip():
        return False

    text = text.strip()

    # Markers of internal objects / logging that sometimes leak through.
    blocked = (
        "Page(id=",
        "elements=[]",
        "number=0",
        "Error:",
        "Exception:",
        "Traceback:",
        "DEBUG:",
        "INFO:",
        "WARNING:",
        "ERROR:",
    )
    if any(marker in text for marker in blocked):
        return False

    if len(text) < 1:
        return False

    import re

    # Require at least one alphanumeric or math-ish character.
    return bool(re.search(r"[a-zA-Z0-9=+\-*/(){}[\]^_√∫∑∂πθαβγδλμΩ]", text))
|
| |
|
| |
|
def is_debug_content(text):
    """
    Check if text appears to be debug/logging content rather than actual content.
    """
    # Substrings typical of repr() dumps, log lines, and tracebacks.
    markers = (
        "Page(",
        "id=",
        "number=",
        "elements=",
        "[])",
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "Exception",
        "Traceback",
        'File "',
        "line ",
        " at 0x",
    )
    return any(marker in text for marker in markers)
|
| |
|
| |
|
def extract_math_region_traditional(pil_image, region):
    """
    Fallback traditional OCR for mathematical expressions.

    Crops the region from a grayscale conversion of `pil_image` and runs
    Tesseract restricted to a whitelist of math-relevant characters.
    Returns the originally detected text when OCR yields nothing or fails.
    """
    img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    x, y, w, h = region["left"], region["top"], region["width"], region["height"]
    roi = gray[y : y + h, x : x + w]

    if roi.size == 0:
        return region["text"]

    # Whitelist limits Tesseract to characters plausible inside formulas;
    # psm 6 treats the crop as a uniform block of text.
    math_chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=+-×÷∑∫√π∞∂→≤≥∝∴∵∠∆∇()[]{}.,;:^_αβγδλμθΩ±≈≠≡⇒⇔"
    config = f"--oem 3 --psm 6 -c tessedit_char_whitelist={math_chars}"

    try:
        result = pytesseract.image_to_string(roi, config=config).strip()
        return result if result else region["text"]
    except Exception:
        return region["text"]
|
| |
|
| |
|
def extract_mixed_region(pil_image, region, p2t_model):
    """Extract mixed content using multiple approaches.

    Runs English and Bangla OCR on the region; when the originally
    detected text also looks mathematical, additionally tries the math
    path and keeps the longest non-empty candidate.

    NOTE(review): region coordinates are applied both to `pil_image` and
    to its grayscale conversion here — confirm the caller passes an image
    in the same coordinate space as the detected regions.
    """
    img = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    eng_result = extract_english_region(gray, region)
    bangla_result = extract_bangla_region(gray, region)

    if has_math_patterns(region["text"]):
        math_result = extract_math_region_pix2text(pil_image, region, p2t_model)

        # Longest non-empty candidate wins; fall back to the raw detection.
        results = [r for r in [eng_result, bangla_result, math_result] if r.strip()]
        return max(results, key=len) if results else region["text"]

    # No math indicators: prefer whichever OCR produced more text.
    return bangla_result if len(bangla_result) > len(eng_result) else eng_result
|
| |
|
| |
|
| |
|
| |
|
| |
|
def analyze_character_by_character(text):
    """Analyze text character by character to identify language patterns.

    Returns a dict with:
      - "characters": per-character classification + Unicode name,
      - "language_segments": contiguous runs of the same type
        (spaces/punctuation are skipped and act as transparent separators),
      - "total_chars": length of the input,
      - "language_distribution": count of characters per type.
    """
    analysis = {
        "characters": [],
        "language_segments": [],
        "total_chars": len(text),
        "language_distribution": defaultdict(int),
    }

    for i, char in enumerate(text):
        char_type = classify_character(char)
        analysis["characters"].append(
            {
                "char": char,
                "position": i,
                "type": char_type,
                # unicodedata.name raises for unnamed code points unless a
                # default is supplied.
                "unicode_name": unicodedata.name(char, "UNKNOWN"),
            }
        )
        analysis["language_distribution"][char_type] += 1

    # Merge consecutive characters of the same type into segments.
    current_segment = None
    for char_info in analysis["characters"]:
        if char_info["type"] in ["space", "punctuation"]:
            continue

        if current_segment is None or current_segment["type"] != char_info["type"]:
            # Type changed: flush the previous segment and open a new one.
            if current_segment:
                analysis["language_segments"].append(current_segment)
            current_segment = {
                "type": char_info["type"],
                "start": char_info["position"],
                "end": char_info["position"],
                "text": char_info["char"],
            }
        else:
            current_segment["end"] = char_info["position"]
            current_segment["text"] += char_info["char"]

    # Flush the trailing segment.
    if current_segment:
        analysis["language_segments"].append(current_segment)

    return analysis
|
| |
|
| |
|
| |
|
| |
|
| |
|
def process_page_advanced(page_image, page_num, p2t_model):
    """
    Advanced page processing with Pix2Text integration.

    Preprocesses the page, detects and line-groups text regions, then
    dispatches each region to a type-specific extractor. Returns a list
    of per-region result dicts.

    Fix: region coordinates come from the preprocessed image, which
    preprocess_image_advanced upscales 2x. The math/mixed extractors were
    previously handed the ORIGINAL page image with those scaled
    coordinates, so they cropped the wrong area. They now receive a PIL
    view of the preprocessed image, keeping every extractor in the same
    coordinate space.
    """
    print(f"Processing page {page_num + 1}...")

    processed_image = preprocess_image_advanced(page_image)

    regions = detect_text_regions(processed_image)

    lines = group_regions_by_line(regions)

    # RGB conversion because the math/mixed helpers expect a 3-channel
    # image (they apply cv2.COLOR_RGB2BGR to np.array(pil_image)).
    processed_pil = Image.fromarray(processed_image).convert("RGB")

    page_results = []

    for line_num, line in enumerate(lines):
        line_text_parts = []

        for region in line:
            # Dispatch on the coarse classification from detect_text_regions.
            if region["type"] == "english":
                extracted_text = extract_english_region(processed_image, region)
            elif region["type"] == "bangla":
                extracted_text = extract_bangla_region(processed_image, region)
            elif region["type"] == "math":
                extracted_text = extract_math_region_pix2text(
                    processed_pil, region, p2t_model
                )
            elif region["type"] == "mixed":
                extracted_text = extract_mixed_region(processed_pil, region, p2t_model)
            else:
                extracted_text = region["text"]

            char_analysis = analyze_character_by_character(extracted_text)

            region_result = {
                "page": page_num,
                "line": line_num,
                "text": extracted_text,
                "original_text": region["text"],
                "position": {
                    "left": region["left"],
                    "top": region["top"],
                    "width": region["width"],
                    "height": region["height"],
                },
                "confidence": region["confidence"],
                "detected_type": region["type"],
                # Label reflects the method attempted for math regions even
                # though the math path may itself fall back to Tesseract.
                "extraction_method": "pix2text"
                if region["type"] == "math" and p2t_model
                else "tesseract",
                "character_analysis": char_analysis,
            }

            page_results.append(region_result)
            line_text_parts.append(extracted_text)

        if line_text_parts:
            line_text = " ".join(line_text_parts)
            print(f" Line {line_num + 1}: {line_text[:100]}...")

    return page_results
|
| |
|
| |
|
def extract_all_text_advanced_pix2text(
    pdf_path, output_text_file, output_json_file, output_analysis_file
):
    """
    Advanced text extraction with Pix2Text integration.

    Converts the PDF at `pdf_path` to 300-DPI page images, extracts all
    text regions page by page, then writes three artifacts: the combined
    plain text, the per-region JSON, and a summary analysis JSON.
    """
    print("[INFO] Initializing Pix2Text for mathematical expression extraction...")
    p2t_model = initialize_pix2text()

    if p2t_model:
        print("✅ Pix2Text ready for advanced math extraction")
    else:
        print("⚠️ Using traditional OCR for math expressions")

    print("[INFO] Converting PDF to images...")
    # 300 DPI is a common OCR sweet spot between accuracy and memory.
    pages = convert_from_path(pdf_path, dpi=300)

    all_results = []
    combined_text_parts = []

    for page_num, page_image in enumerate(tqdm(pages, desc="Processing pages")):
        page_results = process_page_advanced(page_image, page_num, p2t_model)
        all_results.extend(page_results)

        # One space-joined string per page for the plain-text output.
        page_text_parts = [result["text"] for result in page_results]
        page_text = " ".join(page_text_parts)
        combined_text_parts.append(page_text)

    # Pages are separated by blank lines in the text file.
    final_text = "\n\n".join(combined_text_parts)

    with open(output_text_file, "w", encoding="utf-8") as f:
        f.write(final_text)

    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    summary_analysis = create_extraction_summary(all_results)
    with open(output_analysis_file, "w", encoding="utf-8") as f:
        json.dump(summary_analysis, f, ensure_ascii=False, indent=2)

    print("\n[✅] Advanced Pix2Text extraction complete!")
    print(f"→ Text file saved to: {output_text_file}")
    print(f"→ Detailed JSON saved to: {output_json_file}")
    print(f"→ Analysis report saved to: {output_analysis_file}")

    print("\n📊 Extraction Summary:")
    print(f" Total text regions: {len(all_results)}")
    # type_distribution is a defaultdict, so absent types print as 0.
    print(f" English regions: {summary_analysis['type_distribution']['english']}")
    print(f" Bangla regions: {summary_analysis['type_distribution']['bangla']}")
    print(f" Math regions: {summary_analysis['type_distribution']['math']}")
    print(f" Mixed regions: {summary_analysis['type_distribution']['mixed']}")

    # Tally which extraction method was used per region.
    method_stats = defaultdict(int)
    for result in all_results:
        method_stats[result.get("extraction_method", "unknown")] += 1

    print("\n🔧 Extraction Methods Used:")
    for method, count in method_stats.items():
        print(f" {method}: {count} regions")
|
| |
|
| |
|
def create_extraction_summary(results):
    """Create a comprehensive summary of the extraction results.

    Aggregates region counts by type, character counts by classification,
    confidence min/max/avg, language-segment counts, and extraction-method
    usage into a single report dict.
    """
    summary = {
        "total_regions": len(results),
        "total_pages": len({r["page"] for r in results}),
        "type_distribution": defaultdict(int),
        "character_distribution": defaultdict(int),
        "confidence_stats": {"min": 100, "max": 0, "avg": 0},
        "language_segments_summary": defaultdict(int),
        "extraction_methods": defaultdict(int),
    }

    conf_sum = 0
    stats = summary["confidence_stats"]
    for item in results:
        summary["type_distribution"][item["detected_type"]] += 1
        summary["extraction_methods"][item.get("extraction_method", "unknown")] += 1

        conf = item["confidence"]
        conf_sum += conf
        stats["min"] = min(stats["min"], conf)
        stats["max"] = max(stats["max"], conf)

        # Roll per-region character counts into the global distribution.
        analysis = item["character_analysis"]
        for kind, count in analysis["language_distribution"].items():
            summary["character_distribution"][kind] += count

        for seg in analysis["language_segments"]:
            summary["language_segments_summary"][seg["type"]] += 1

    if results:
        stats["avg"] = conf_sum / len(results)

    return summary
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | if __name__ == "__main__":
|
| | pdf_path = r"math102.pdf"
|
| | output_text_file = "math102_pix2text.txt"
|
| | output_json_file = "math102_pix2text.json"
|
| | output_analysis_file = "math102_pix2text_analysis.json"
|
| |
|
| | extract_all_text_advanced_pix2text(
|
| | pdf_path, output_text_file, output_json_file, output_analysis_file
|
| | )
|
| |
|