Arsh124 commited on
Commit
ebcc7d1
·
1 Parent(s): 6972ce0

Initial RenAI app

Browse files
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ RUN useradd user
4
+
5
+ USER user
6
+
7
+ ENV HOME=/home/user \
8
+ PATH="/home/user/.local/bin:$PATH" \
9
+ PYTHONUNBUFFERED=1 \
10
+ PYTHONDONTWRITEBYTECODE=1
11
+
12
+ WORKDIR $HOME/app
13
+
14
+ RUN apt-get update && apt-get install -y --no-install-recommends \
15
+ libglib2.0-0 \
16
+ libsm6 \
17
+ libxext6 \
18
+ libxrender-dev \
19
+ libgomp1 \
20
+ libgtk-3-0 \
21
+ libavcodec-dev \
22
+ libavformat-dev \
23
+ libswscale-dev \
24
+ && rm -rf /var/lib/apt/lists/*
25
+
26
+ COPY requirements.txt .
27
+
28
+ RUN pip install --no-cache-dir --timeout=100 -r requirements.txt
29
+
30
+ COPY . .
31
+
32
+ EXPOSE 7860
33
+
34
+ CMD ["uvicorn", "app:app", "--host=0.0.0.0", "--port=7860"]
__pycache__/configs.cpython-312.pyc ADDED
Binary file (284 Bytes). View file
 
__pycache__/inference.cpython-312.pyc ADDED
Binary file (11.4 kB). View file
 
__pycache__/vit.cpython-312.pyc ADDED
Binary file (7.01 kB). View file
 
app.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import JSONResponse
4
+ from typing import Optional, Any, Dict, Union
5
+ import shutil
6
+ import os
7
+ import json
8
+ from loguru import logger
9
+ from pathlib import Path
10
+ from main import RenAITranscription
11
+ import tempfile
12
+ import numpy as np
13
+ from datetime import datetime
14
+ import base64
15
+ from io import BytesIO
16
+ from PIL import Image
17
+
18
+ app = FastAPI(title="RenAI Transcription API", version="1.0.0")
19
+
20
+ # Add CORS middleware
21
+ # app.add_middleware(
22
+ # CORSMiddleware,
23
+ # allow_origins=["*"],
24
+ # allow_credentials=True,
25
+ # allow_methods=["*"],
26
+ # allow_headers=["*"],
27
+ # )
28
+
29
+ ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
30
+ MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
31
+
32
+ def numpy_to_base64(array: np.ndarray, format: str = 'PNG', quality: int = 85) -> str:
33
+ """
34
+ Convert numpy array (image) to base64 encoded string for web display.
35
+
36
+ Args:
37
+ array: Numpy array representing the image
38
+ format: Image format ('PNG' or 'JPEG')
39
+ quality: JPEG quality (1-100), only used if format is JPEG
40
+
41
+ Returns:
42
+ Data URI string that can be directly used in HTML <img> src attribute
43
+ """
44
+ try:
45
+ # Convert numpy array to PIL Image
46
+ img = Image.fromarray(array)
47
+
48
+ # Save to bytes buffer
49
+ buffer = BytesIO()
50
+ if format.upper() == 'JPEG':
51
+ # Convert to RGB if needed (JPEG doesn't support transparency)
52
+ if img.mode in ('RGBA', 'LA', 'P'):
53
+ background = Image.new('RGB', img.size, (255, 255, 255))
54
+ if img.mode == 'P':
55
+ img = img.convert('RGBA')
56
+ background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
57
+ img = background
58
+ img.save(buffer, format='JPEG', quality=quality, optimize=True)
59
+ mime_type = 'image/jpeg'
60
+ else:
61
+ img.save(buffer, format='PNG', optimize=True)
62
+ mime_type = 'image/png'
63
+
64
+ # Encode to base64
65
+ img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
66
+ return f"data:{mime_type};base64,{img_str}"
67
+ except Exception as e:
68
+ logger.error(f"Error converting numpy array to base64: {e}")
69
+ return None
70
+
71
+ def format_transcription_result(result: Dict, include_images: bool = False, image_format: str = 'PNG') -> Dict[str, Any]:
72
+ """
73
+ Format transcription result into a structured response.
74
+
75
+ Args:
76
+ result: Dictionary with line IDs as keys, each containing 'image' and 'transcription'
77
+ include_images: Whether to include base64 encoded images in response
78
+ image_format: Image format for base64 encoding ('PNG' or 'JPEG')
79
+
80
+ Returns:
81
+ Formatted dictionary with transcription data
82
+ """
83
+ formatted_lines = {}
84
+ transcription_text = []
85
+
86
+ for line_id, line_data in result.items():
87
+ formatted_line = {
88
+ 'line_id': line_id,
89
+ 'transcription': line_data.get('transcription', '')
90
+ }
91
+
92
+ # Optionally include image as base64 (web-ready format)
93
+ if include_images and 'image' in line_data:
94
+ image_array = line_data['image']
95
+ if isinstance(image_array, np.ndarray):
96
+ image_base64 = numpy_to_base64(image_array, format=image_format)
97
+ if image_base64:
98
+ formatted_line['image'] = image_base64
99
+
100
+ formatted_lines[line_id] = formatted_line
101
+ transcription_text.append(f"{line_id}: {line_data.get('transcription', '')}")
102
+
103
+ return {
104
+ 'lines': formatted_lines,
105
+ 'full_text': '\n'.join(transcription_text),
106
+ 'total_lines': len(result)
107
+ }
108
+
109
+ @app.get("/")
110
+ def home():
111
+ return {
112
+ "message": "Hello, RenAI!",
113
+ "version": "1.0.0",
114
+ "endpoints": {
115
+ "transcribe": "/renai-transcribe (POST)",
116
+ "transcribe_base64": "/renai-transcribe-base64 (POST)",
117
+ "health": "/health (GET)"
118
+ }
119
+ }
120
+
121
+ @app.post("/renai-transcribe")
122
+ async def transcription_endpoint(
123
+ image: UploadFile = File(..., description="Image file to transcribe"),
124
+ userToken: Optional[str] = Form(None, description="User authentication token"),
125
+ post_processing_enabled: bool = Form(False, description="Enable post-processing"),
126
+ unet_enabled: bool = Form(False, description="Enable UNet processing"),
127
+ include_images: bool = Form(True, description="Include base64 encoded line images in response"),
128
+ image_format: str = Form("JPEG", description="Image format for line images: PNG or JPEG")
129
+ ):
130
+ """
131
+ Upload an image file and get transcription results.
132
+
133
+ - **image**: Image file (JPG, PNG, BMP, TIFF, WebP)
134
+ - **userToken**: Optional user authentication token
135
+ - **post_processing_enabled**: Enable/disable post-processing
136
+ - **unet_enabled**: Enable/disable UNet processing
137
+ - **include_images**: Include base64 encoded images of each line (web-ready format)
138
+ - **image_format**: Format for line images: 'PNG' (higher quality, larger) or 'JPEG' (smaller, faster)
139
+ """
140
+ start_time = datetime.now()
141
+ logger.info(f"Transcription request received for file: {image.filename} by userToken: {userToken if userToken else 'Anonymous'}")
142
+
143
+ # Validate file type
144
+ if not image.filename:
145
+ raise HTTPException(status_code=400, detail="No file provided")
146
+
147
+ file_extension = Path(image.filename).suffix.lower()
148
+ if file_extension not in ALLOWED_EXTENSIONS:
149
+ raise HTTPException(
150
+ status_code=400,
151
+ detail=f"Invalid file type. Allowed types: {', '.join(ALLOWED_EXTENSIONS)}"
152
+ )
153
+
154
+ # Check file size
155
+ if image.size and image.size > MAX_FILE_SIZE:
156
+ raise HTTPException(
157
+ status_code=400,
158
+ detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
159
+ )
160
+
161
+ temp_file_path = None
162
+ try:
163
+ # Create temporary file
164
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
165
+ shutil.copyfileobj(image.file, temp_file)
166
+ temp_file_path = temp_file.name
167
+
168
+ logger.info(f"Processing image: {temp_file_path}")
169
+
170
+ # Call transcription function
171
+ result = RenAITranscription(
172
+ image=temp_file_path,
173
+ post_processing_enabled=post_processing_enabled,
174
+ unet_enabled=unet_enabled
175
+ )
176
+
177
+ logger.info(f"Transcription completed. Result type: {type(result)}, Lines: {len(result)}")
178
+
179
+ # Format the result
180
+ formatted_result = format_transcription_result(result, include_images=include_images, image_format=image_format)
181
+
182
+ # Clean up
183
+ os.unlink(temp_file_path)
184
+
185
+ processing_time = (datetime.now() - start_time).total_seconds()
186
+ logger.info(f"Request completed in {processing_time:.2f}s")
187
+
188
+ response_data = {
189
+ "success": True,
190
+ "filename": image.filename,
191
+ "transcription": formatted_result,
192
+ "metadata": {
193
+ "processing_time_seconds": round(processing_time, 2),
194
+ "timestamp": datetime.now().isoformat(),
195
+ "total_lines": formatted_result['total_lines'],
196
+ "parameters": {
197
+ "post_processing_enabled": post_processing_enabled,
198
+ "unet_enabled": unet_enabled,
199
+ "include_images": include_images,
200
+ "userToken": userToken if userToken else "Anonymous"
201
+ }
202
+ }
203
+ }
204
+
205
+ return JSONResponse(content=response_data)
206
+
207
+ except Exception as e:
208
+ # Clean up
209
+ if temp_file_path and os.path.exists(temp_file_path):
210
+ try:
211
+ os.unlink(temp_file_path)
212
+ except:
213
+ pass
214
+
215
+ logger.error(f"Transcription failed: {e}")
216
+
217
+ raise HTTPException(
218
+ status_code=500,
219
+ detail={
220
+ "error": str(e),
221
+ "type": type(e).__name__
222
+ }
223
+ )
224
+
225
+ @app.post("/renai-transcribe-base64")
226
+ async def transcription_base64_endpoint(
227
+ image_data: str = Form(..., description="Base64 encoded image data"),
228
+ userToken: Optional[str] = Form(None, description="User authentication token"),
229
+ post_processing_enabled: bool = Form(False, description="Enable post-processing"),
230
+ unet_enabled: bool = Form(False, description="Enable UNet processing"),
231
+ include_images: bool = Form(False, description="Include base64 encoded line images in response"),
232
+ image_format: str = Form("JPEG", description="Image format for line images: PNG or JPEG")
233
+ ):
234
+ """
235
+ Alternative endpoint that accepts base64 encoded image data.
236
+ """
237
+ import base64
238
+ import io
239
+ from PIL import Image
240
+
241
+ start_time = datetime.now()
242
+ logger.info(f"Base64 transcription request received by userToken: {userToken if userToken else 'Anonymous'}")
243
+
244
+ temp_file_path = None
245
+ try:
246
+ # Remove data URL prefix if present
247
+ if "," in image_data:
248
+ image_data = image_data.split(",", 1)[1]
249
+
250
+ # Decode base64 image
251
+ image_bytes = base64.b64decode(image_data)
252
+ image_pil = Image.open(io.BytesIO(image_bytes))
253
+
254
+ # Create temporary file
255
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
256
+ image_pil.save(temp_file.name)
257
+ temp_file_path = temp_file.name
258
+
259
+ logger.info(f"Processing base64 image: {temp_file_path}")
260
+
261
+ # Call transcription function
262
+ result = RenAITranscription(
263
+ image=temp_file_path,
264
+ post_processing_enabled=post_processing_enabled,
265
+ unet_enabled=unet_enabled
266
+ )
267
+
268
+ # Format the result
269
+ formatted_result = format_transcription_result(result, include_images=include_images, image_format=image_format)
270
+
271
+ # Clean up
272
+ os.unlink(temp_file_path)
273
+
274
+ processing_time = (datetime.now() - start_time).total_seconds()
275
+ logger.info(f"Base64 request completed in {processing_time:.2f}s")
276
+
277
+ response_data = {
278
+ "success": True,
279
+ "transcription": formatted_result,
280
+ "metadata": {
281
+ "processing_time_seconds": round(processing_time, 2),
282
+ "timestamp": datetime.now().isoformat(),
283
+ "total_lines": formatted_result['total_lines'],
284
+ "parameters": {
285
+ "post_processing_enabled": post_processing_enabled,
286
+ "unet_enabled": unet_enabled,
287
+ "include_images": include_images,
288
+ "image_format": image_format if include_images else None,
289
+ "userToken": userToken if userToken else "Anonymous"
290
+ }
291
+ }
292
+ }
293
+
294
+ return JSONResponse(content=response_data)
295
+
296
+ except Exception as e:
297
+ if temp_file_path and os.path.exists(temp_file_path):
298
+ try:
299
+ os.unlink(temp_file_path)
300
+ except:
301
+ pass
302
+
303
+ logger.error(f"Base64 transcription failed: {e}")
304
+
305
+ raise HTTPException(
306
+ status_code=500,
307
+ detail={
308
+ "error": str(e),
309
+ "type": type(e).__name__
310
+ }
311
+ )
312
+
313
+ @app.get("/health")
314
+ def health_check():
315
+ try:
316
+ return {
317
+ "status": "healthy",
318
+ "service": "RenAI Transcription API",
319
+ "timestamp": datetime.now().isoformat()
320
+ }
321
+ except Exception as e:
322
+ logger.error(f"Health check failed: {e}")
323
+ raise HTTPException(status_code=500, detail="Service unhealthy")
inference.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import DataLoader
3
+ from PIL import Image
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import re
7
+ import cv2
8
+ import string
9
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
10
+ from vit import LineDataset, collate_fn
11
+ from loguru import logger
12
+ import os
13
+
14
+ class Inference:
15
+ def __init__(self, model_path, processor_path, target_size=(256, 64), batch_size=32):
16
+ """
17
+ Initialize the TextGenerator with model and processor paths.
18
+
19
+ Args:
20
+ model_path (str): Path to the pre-trained model
21
+ processor_path (str): Path to the pre-trained processor
22
+ target_size (tuple): Target size for input images (height, width)
23
+ batch_size (int): Batch size for inference
24
+ """
25
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+ self.model_path = self._get_absolute_path(model_path)
27
+ self.processor_path = self._get_absolute_path(processor_path)
28
+ self.target_size = target_size
29
+ self.batch_size = batch_size
30
+
31
+ # Initialize model and processor
32
+ self.processor = None
33
+ self.model = None
34
+ self._initialize_model()
35
+
36
+ def _get_absolute_path(self, path):
37
+ """Convert relative path to absolute path"""
38
+ if os.path.isabs(path):
39
+ return path
40
+ # If it's a relative path, make it absolute relative to the current working directory
41
+ return os.path.join(os.getcwd(), path.lstrip('./'))
42
+
43
+
44
+ def _initialize_model(self):
45
+ """Load and initialize the model and processor."""
46
+ logger.info("Loading model...")
47
+
48
+ # Check if paths exist
49
+ if not os.path.exists(self.model_path):
50
+ raise FileNotFoundError(f"Model path not found: {self.model_path}")
51
+ if not os.path.exists(self.processor_path):
52
+ raise FileNotFoundError(f"Processor path not found: {self.processor_path}")
53
+
54
+ # List all files in the model directory
55
+ all_files = os.listdir(self.model_path)
56
+
57
+ # Validate that we have the necessary files
58
+ if not any(f in all_files for f in ['pytorch_model.bin', 'model.safetensors']):
59
+ logger.error("No model weights file found! (pytorch_model.bin or model.safetensors)")
60
+ raise FileNotFoundError("Model weights file missing")
61
+
62
+ if 'config.json' not in all_files:
63
+ logger.error("config.json file not found!")
64
+ raise FileNotFoundError("config.json missing")
65
+
66
+ logger.info(f"Loading model from: {self.model_path}")
67
+ logger.info(f"Loading processor from: {self.processor_path}")
68
+
69
+ try:
70
+ # Load processor
71
+ self.processor = TrOCRProcessor.from_pretrained(self.processor_path, do_rescale=False, use_fast=True)
72
+ logger.info("Processor loaded successfully")
73
+
74
+ # Try different loading methods for the model
75
+ logger.info("Attempting to load model...")
76
+
77
+ # Method 1: Try with explicit device mapping
78
+ try:
79
+ self.model = VisionEncoderDecoderModel.from_pretrained(
80
+ self.model_path,
81
+ use_safetensors=True,
82
+ device_map="auto" if torch.cuda.is_available() else None
83
+ )
84
+ logger.info("Model loaded with safetensors=True and device_map")
85
+ except Exception as e1:
86
+ logger.warning(f"Method 1 failed: {e1}")
87
+
88
+ # Method 2: Try without device mapping
89
+ try:
90
+ self.model = VisionEncoderDecoderModel.from_pretrained(
91
+ self.model_path,
92
+ use_safetensors=True
93
+ )
94
+ logger.info("Model loaded with safetensors=True")
95
+ except Exception as e2:
96
+ logger.warning(f"Method 2 failed: {e2}")
97
+
98
+ # Method 3: Try without safetensors
99
+ try:
100
+ self.model = VisionEncoderDecoderModel.from_pretrained(
101
+ self.model_path,
102
+ use_safetensors=True
103
+ )
104
+ logger.info("Model loaded with safetensors=False")
105
+ except Exception as e3:
106
+ logger.error(f"All loading methods failed: {e3}")
107
+ raise
108
+
109
+ # Move model to device if not already done by device_map
110
+ if not hasattr(self.model, 'device') or str(self.model.device) != str(self.device):
111
+ logger.info(f"Moving model to device: {self.device}")
112
+ self.model.to(self.device)
113
+
114
+ self.model.eval()
115
+ logger.info("Model loaded successfully and moved to device")
116
+
117
+ except Exception as e:
118
+ logger.error(f"Error loading model or processor: {e}")
119
+ import traceback
120
+ logger.error(f"Traceback: {traceback.format_exc()}")
121
+ raise
122
+ def preprocess_images(self, line_segments):
123
+ """
124
+ Prepare line images for inference.
125
+
126
+ Args:
127
+ line_segments (dict): Dictionary containing line segment information
128
+
129
+ Returns:
130
+ tuple: (keys, line_images) - keys and corresponding images
131
+ """
132
+ keys = list(line_segments.keys())
133
+ line_images = [line_segments[k]["image"] for k in keys]
134
+ return keys, line_images
135
+
136
+ def create_dataloader(self, line_images):
137
+ """
138
+ Create DataLoader for inference.
139
+
140
+ Args:
141
+ line_images (list): List of line images
142
+
143
+ Returns:
144
+ DataLoader: Configured DataLoader for inference
145
+ """
146
+ # Create dummy labels for inference
147
+ dummy_labels = [""] * len(line_images)
148
+
149
+ dataset = LineDataset(
150
+ self.processor,
151
+ self.model,
152
+ line_images,
153
+ dummy_labels,
154
+ self.target_size,
155
+ apply_augmentation=False
156
+ )
157
+
158
+ dataloader = DataLoader(
159
+ dataset,
160
+ batch_size=self.batch_size,
161
+ shuffle=False,
162
+ collate_fn=collate_fn
163
+ )
164
+
165
+ return dataloader
166
+
167
+ def generate_texts(self, dataloader):
168
+ """
169
+ Generate texts from images using the model.
170
+
171
+ Args:
172
+ dataloader (DataLoader): DataLoader containing preprocessed images
173
+
174
+ Returns:
175
+ list: List of generated texts
176
+ """
177
+ generated_texts = []
178
+
179
+ with torch.no_grad():
180
+ for batch in dataloader:
181
+ pixel_values = batch["pixel_values"].to(self.device)
182
+ generated_ids = self.model.generate(pixel_values)
183
+ generated_texts_batch = self.processor.batch_decode(
184
+ generated_ids,
185
+ skip_special_tokens=True
186
+ )
187
+ generated_texts.extend(generated_texts_batch)
188
+
189
+ return generated_texts
190
+
191
+ def update_line_segments(self, line_segments, keys, generated_texts):
192
+ """
193
+ Update line segments dictionary with generated transcriptions.
194
+
195
+ Args:
196
+ line_segments (dict): Original line segments dictionary
197
+ keys (list): List of keys corresponding to the line segments
198
+ generated_texts (list): List of generated texts
199
+
200
+ Returns:
201
+ dict: Updated line segments dictionary with transcriptions
202
+ """
203
+ for key, text in zip(keys, generated_texts):
204
+ line_segments[key]["transcription"] = text
205
+
206
+ return line_segments
207
+
208
+ def generate_texts_from_images(self, line_segments):
209
+ """
210
+ Main method to generate texts from line segment images.
211
+
212
+ Args:
213
+ line_segments (dict): Dictionary containing line segment information
214
+ with "image" key for each segment
215
+
216
+ Returns:
217
+ dict: Updated line segments dictionary with "transcription" key added
218
+ """
219
+ logger.info("Starting text generation from images...")
220
+ # Preprocess images
221
+ keys, line_images = self.preprocess_images(line_segments)
222
+
223
+ # Create dataloader
224
+ dataloader = self.create_dataloader(line_images)
225
+
226
+ # Generate texts
227
+ generated_texts = self.generate_texts(dataloader)
228
+
229
+ # Update line segments with transcriptions
230
+ updated_line_segments = self.update_line_segments(
231
+ line_segments, keys, generated_texts
232
+ )
233
+
234
+ return updated_line_segments
235
+
236
+ def generate_single_image(self, image):
237
+ """
238
+ Generate text from a single image.
239
+
240
+ Args:
241
+ image: PIL Image or numpy array
242
+
243
+ Returns:
244
+ str: Generated text
245
+ """
246
+ if isinstance(image, np.ndarray):
247
+ image = Image.fromarray(image)
248
+
249
+ # Create a temporary line_segments-like structure
250
+ temp_segments = {"temp_key": {"image": image}}
251
+
252
+ # Use the main method
253
+ result = self.generate_texts_from_images(temp_segments)
254
+
255
+ return result["temp_key"]["transcription"]
main.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from skimage.io import imread, imsave
2
+ from skimage.color import rgb2gray
3
+ from PIL import Image
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ from skimage.transform import resize
7
+ from utils.preprocessing import preprocessImage, postProcessImage, process_segment_and_crop_image
8
+ from utils.line_segmentation import segment_image_to_lines
9
+ from configs import unet_enabled
10
+ from utils.helper import load_images_from_json
11
+ from inference import Inference
12
+ from configs import model_path, processor_path, unet_model_path
13
+ from utils.postprocessing import PostProcessing
14
+ from loguru import logger
15
+
16
+ def RenAITranscription(image, post_processing_enabled=False,unet_enabled=False):
17
+ # 1- preprocessing
18
+ org_img = imread(image)[: , : ,:]
19
+
20
+ logger.info(f'Image Dimensions : {org_img.shape[0]} x {org_img.shape[1]}')
21
+
22
+ intial_process_image = preprocessImage(org_img)
23
+
24
+ if unet_enabled:
25
+ logger.info("Masked based segmentation and cropping enabled...")
26
+ cropped_img = process_segment_and_crop_image(unet_model_path, org_img, intial_process_image, padding=10, min_contour_area=100)
27
+ processed_image = postProcessImage(cropped_img)
28
+ logger.info(f"Image cropped and Pre-processed successfully.....")
29
+ else:
30
+ logger.info("Image Preprocessing started......")
31
+ processed_image = postProcessImage(intial_process_image)
32
+ logger.info(f"Image Pre-processed successfully.....")
33
+
34
+ # 2 - Line segmentation Algorithm
35
+ line_segments = segment_image_to_lines(processed_image, base_key="line",ct=0)
36
+
37
+ # 3 - Model Inference
38
+
39
+ transciption_generator = Inference(
40
+ model_path=model_path,
41
+ processor_path=processor_path,
42
+ target_size=(256, 64),
43
+ batch_size=32
44
+ )
45
+ result = transciption_generator.generate_texts_from_images(line_segments)
46
+
47
+ # Generated texts
48
+ for key, value in result.items():
49
+ print(f"{key}: {value['transcription']}")
50
+
51
+ # 4 - Post processing
52
+ # Dictionary based fuzzy matching
53
+ if post_processing_enabled:
54
+ for key, value in result.items():
55
+ corrected = PostProcessing(value['transcription'])
56
+ result[key]['post_processed'] = corrected
57
+ print(f"{key}: {value['post_processed']}")
58
+
59
+ print(result)
60
+
61
+ logger.info("Transcription completed successfully!")
62
+ return result
63
+
64
+ if __name__ == "__main__":
65
+ RenAITranscription("1.png", post_processing_enabled=False, unet_enabled=False)
requirements.txt ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==1.5.1
3
+ aiohappyeyeballs==2.5.0
4
+ aiohttp==3.11.13
5
+ aiosignal==1.3.2
6
+ albucore==0.0.23
7
+ albumentations==2.0.5
8
+ annotated-types==0.7.0
9
+ anyio==4.8.0
10
+ argon2-cffi==23.1.0
11
+ argon2-cffi-bindings==21.2.0
12
+ arrow==1.3.0
13
+ asttokens==3.0.0
14
+ astunparse==1.6.3
15
+ async-lru==2.0.4
16
+ attrs==25.1.0
17
+ babel==2.17.0
18
+ beautifulsoup4==4.13.3
19
+ bleach==6.2.0
20
+ blinker==1.9.0
21
+ blis==1.2.0
22
+ catalogue==2.0.10
23
+ certifi==2025.1.31
24
+ cffi==1.17.1
25
+ charset-normalizer==3.4.1
26
+ click==8.1.8
27
+ cloudpathlib==0.21.0
28
+ colorama==0.4.6
29
+ comm==0.2.2
30
+ confection==0.1.5
31
+ contourpy==1.3.1
32
+ cycler==0.12.1
33
+ cymem==2.0.11
34
+ datasets==3.3.2
35
+ datetime
36
+ debugpy==1.8.13
37
+ decorator==5.2.1
38
+ defusedxml==0.7.1
39
+ deskew
40
+ dill==0.3.8
41
+ editdistance==0.8.1
42
+ einops==0.8.1
43
+ evaluate==0.4.3
44
+ executing==2.2.0
45
+ fastapi
46
+ fastjsonschema==2.21.1
47
+ filelock==3.17.0
48
+ Flask==3.1.0
49
+ flatbuffers==25.2.10
50
+ fonttools==4.56.0
51
+ fqdn==1.5.1
52
+ frozenlist==1.5.0
53
+ fsspec==2024.12.0
54
+ gast==0.6.0
55
+ gensim==4.3.3
56
+ google-pasta==0.2.0
57
+ greenlet==3.1.1
58
+ grpcio==1.71.0
59
+ h11==0.14.0
60
+ h5py==3.13.0
61
+ httpcore==1.0.7
62
+ httpx==0.28.1
63
+ huggingface-hub==0.34.4
64
+ idna==3.10
65
+ imageio==2.37.0
66
+ iniconfig==2.0.0
67
+ inquirerpy==0.3.4
68
+ ipykernel==6.29.5
69
+ ipython==9.0.2
70
+ ipython_pygments_lexers==1.1.1
71
+ ipywidgets==8.1.5
72
+ isoduration==20.11.0
73
+ itsdangerous==2.2.0
74
+ jedi==0.19.2
75
+ Jinja2==3.1.6
76
+ jiwer==3.1.0
77
+ joblib==1.4.2
78
+ json5==0.10.0
79
+ jsonpointer==3.0.0
80
+ jsonschema==4.23.0
81
+ jsonschema-specifications==2024.10.1
82
+ jupyter==1.1.1
83
+ jupyter-console==6.6.3
84
+ jupyter-events==0.12.0
85
+ jupyter-lsp==2.2.5
86
+ jupyter_client==8.6.3
87
+ jupyter_core==5.7.2
88
+ jupyter_server==2.15.0
89
+ jupyter_server_terminals==0.5.3
90
+ jupyterlab==4.3.5
91
+ jupyterlab_pygments==0.3.0
92
+ jupyterlab_server==2.27.3
93
+ jupyterlab_widgets==3.0.13
94
+ keras==3.9.0
95
+ kiwisolver==1.4.8
96
+ langcodes==3.5.0
97
+ language_data==1.3.0
98
+ lazy_loader==0.4
99
+ Levenshtein==0.27.1
100
+ libclang==18.1.1
101
+ loguru
102
+ lxml==5.3.1
103
+ marisa-trie==1.2.1
104
+ Markdown==3.7
105
+ markdown-it-py==3.0.0
106
+ MarkupSafe==3.0.2
107
+ matplotlib==3.10.1
108
+ matplotlib-inline==0.1.7
109
+ mdurl==0.1.2
110
+ mistune==3.1.2
111
+ ml_dtypes==0.5.1
112
+ mpmath==1.3.0
113
+ multidict==6.1.0
114
+ multiprocess==0.70.16
115
+ murmurhash==1.0.12
116
+ namex==0.0.8
117
+ narwhals==1.30.0
118
+ nbclient==0.10.2
119
+ nbconvert==7.16.6
120
+ nbformat==5.10.4
121
+ nest-asyncio==1.6.0
122
+ networkx==3.3
123
+ ninja==1.11.1.4
124
+ nltk==3.9.1
125
+ notebook==7.3.2
126
+ notebook_shim==0.2.4
127
+ numpy==1.26.4
128
+ opencv-python==4.11.0.86
129
+ opencv-python-headless
130
+ opt_einsum==3.4.0
131
+ optree==0.14.1
132
+ overrides==7.7.0
133
+ packaging==24.2
134
+ pandas==2.2.3
135
+ pandocfilters==1.5.1
136
+ parso==0.8.4
137
+ pfzy==0.3.4
138
+ pillow==11.1.0
139
+ platformdirs==4.3.6
140
+ plotly==6.0.0
141
+ pluggy==1.5.0
142
+ preshed==3.0.9
143
+ prometheus_client==0.21.1
144
+ prompt_toolkit==3.0.50
145
+ propcache==0.3.0
146
+ protobuf==4.25.6
147
+ psutil==7.0.0
148
+ pure_eval==0.2.3
149
+ pyarrow==19.0.1
150
+ pycparser==2.22
151
+ pydantic==2.10.6
152
+ pydantic_core==2.27.2
153
+ Pygments==2.19.1
154
+ PyMuPDF==1.25.3
155
+ pyparsing==3.2.1
156
+ pytest==8.3.5
157
+ python-dateutil==2.9.0.post0
158
+ python-docx==1.1.2
159
+ python-json-logger==3.3.0
160
+ python-Levenshtein==0.27.1
161
+ python-multipart
162
+ pytz==2025.1
163
+ # pywin32==309
164
+ # pywinpty==2.0.15
165
+ PyYAML==6.0.2
166
+ pyzmq==26.2.1
167
+ RapidFuzz==3.12.2
168
+ referencing==0.36.2
169
+ regex==2024.11.6
170
+ requests==2.32.3
171
+ rfc3339-validator==0.1.4
172
+ rfc3986-validator==0.1.1
173
+ rich==13.9.4
174
+ rpds-py==0.23.1
175
+ safetensors==0.5.3
176
+ scikit-image==0.25.2
177
+ scikit-learn==1.6.1
178
+ scipy==1.13.1
179
+ seaborn==0.13.2
180
+ Send2Trash==1.8.3
181
+ setuptools==75.8.0
182
+ shellingham==1.5.4
183
+ simsimd==6.2.1
184
+ six==1.17.0
185
+ smart-open==7.1.0
186
+ sniffio==1.3.1
187
+ soupsieve==2.6
188
+ spacy==3.8.4
189
+ spacy-legacy==3.0.12
190
+ spacy-loggers==1.0.5
191
+ SQLAlchemy==2.0.38
192
+ srsly==2.5.1
193
+ stack-data==0.6.3
194
+ stringzilla==3.12.3
195
+ sympy==1.13.1
196
+ tensorboard==2.19.0
197
+ tensorboard-data-server==0.7.2
198
+ tensorflow==2.19.0
199
+ # tensorflow-intel==2.16.1
200
+ termcolor==2.5.0
201
+ terminado==0.18.1
202
+ tf_keras==2.19.0
203
+ thinc==8.3.4
204
+ threadpoolctl==3.5.0
205
+ tifffile==2025.2.18
206
+ tinycss2==1.4.0
207
+ tokenizers==0.21.0
208
+ torch==2.4.1
209
+ torchaudio==2.4.1
210
+ torchvision==0.19.1
211
+ tornado==6.4.2
212
+ tqdm==4.67.1
213
+ traitlets==5.14.3
214
+ transformers==4.49.0
215
+ typer==0.15.2
216
+ types-python-dateutil==2.9.0.20241206
217
+ typing_extensions==4.12.2
218
+ tzdata==2025.1
219
+ uri-template==1.3.0
220
+ urllib3==2.3.0
221
+ uvicorn
222
+ wasabi==1.1.3
223
+ wcwidth==0.2.13
224
+ weasel==0.4.1
225
+ webcolors==24.11.1
226
+ webencodings==0.5.1
227
+ websocket-client==1.8.0
228
+ Werkzeug==3.1.3
229
+ wheel==0.45.1
230
+ widgetsnbextension==4.0.13
231
+ wrapt==1.17.2
232
+ xxhash==3.5.0
233
+ yarl==1.18.3
utils/__pycache__/configs.cpython-312.pyc ADDED
Binary file (331 Bytes). View file
 
utils/__pycache__/helper.cpython-312.pyc ADDED
Binary file (660 Bytes). View file
 
utils/__pycache__/inference.cpython-312.pyc ADDED
Binary file (7.26 kB). View file
 
utils/__pycache__/line_segmentation.cpython-312.pyc ADDED
Binary file (12.4 kB). View file
 
utils/__pycache__/postprocessing.cpython-312.pyc ADDED
Binary file (20.1 kB). View file
 
utils/__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (9.67 kB). View file
 
utils/__pycache__/vit.cpython-312.pyc ADDED
Binary file (13.2 kB). View file
 
utils/helper.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ def load_images_from_json(line_segments):
4
+ line_images = []
5
+ image_paths = []
6
+ for key, value in line_segments.items():
7
+ line_images.append(value["image"])
8
+ image_paths.append(value.get("image_path", f"{key}.png"))
9
+
10
+ return line_images, image_paths
utils/line_segmentation.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from skimage.io import imread
2
+ from skimage.color import rgb2gray
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ from skimage.filters import threshold_otsu
6
+ import os
7
+ from skimage.graph import route_through_array
8
+ from heapq import heappush, heappop
9
+ from loguru import logger
10
+
11
def heuristic(a, b):
    """Squared Euclidean distance between grid points *a* and *b*."""
    row_delta = b[0] - a[0]
    col_delta = b[1] - a[1]
    return row_delta * row_delta + col_delta * col_delta
14
+
15
+
16
def get_binary(img):
    """Binarize *img* with Otsu's threshold.

    Foreground (dark) pixels become 1, background 0. A perfectly flat black
    or white image is returned unchanged, since Otsu cannot split it.
    """
    mean_value = np.mean(img)
    # Nothing to threshold on an all-black / all-white image.
    if mean_value in (0.0, 1.0):
        return img

    otsu_threshold = threshold_otsu(img)
    return (img <= otsu_threshold).astype(np.uint8)
26
+
27
+
28
def astar(array, start, goal):
    """Find a path through a binary grid with A* search (8-connected).

    Args:
        array: 2-D grid (indexable as ``array[r][c]`` with a ``.shape``);
            cells equal to 1 are obstacles, anything else is walkable.
        start: (row, col) starting cell.
        goal: (row, col) target cell.

    Returns:
        list: Path as (row, col) tuples ordered from *goal* back to (but
        excluding) *start*; an empty list when the goal is unreachable.

    Notes:
        The step cost is the *squared* step distance (1 for straight moves,
        2 for diagonals), preserving the original cost model; it is not a
        true Euclidean metric.
    """

    def _sq_dist(a, b):
        # Squared Euclidean distance; inlined so the block is self-contained.
        return (b[0] - a[0]) ** 2 + (b[1] - a[1]) ** 2

    neighbors = [(0, 1), (0, -1), (1, 0), (-1, 0),
                 (1, 1), (1, -1), (-1, 1), (-1, -1)]
    rows, cols = array.shape[0], array.shape[1]

    close_set = set()
    came_from = {}
    gscore = {start: 0}
    oheap = []
    heappush(oheap, (_sq_dist(start, goal), start))

    while oheap:
        current = heappop(oheap)[1]
        if current in close_set:
            # Stale duplicate left in the heap after a better score was pushed.
            continue

        if current == goal:
            # Walk the parent chain back; start itself is excluded, matching
            # the original return convention.
            data = []
            while current in came_from:
                data.append(current)
                current = came_from[current]
            return data

        close_set.add(current)
        for dr, dc in neighbors:
            neighbor = (current[0] + dr, current[1] + dc)
            # Skip out-of-bounds cells and obstacles.
            if not (0 <= neighbor[0] < rows and 0 <= neighbor[1] < cols):
                continue
            if array[neighbor[0]][neighbor[1]] == 1:
                continue
            if neighbor in close_set:
                continue

            tentative_g = gscore[current] + _sq_dist(current, neighbor)
            # BUG FIX: the original defaulted unknown g-scores to 0, which made
            # the "is this an improvement?" comparison meaningless, and used an
            # O(n) scan of the heap for open-set membership. Defaulting to +inf
            # and re-pushing on improvement is the standard formulation.
            if tentative_g < gscore.get(neighbor, float("inf")):
                came_from[neighbor] = current
                gscore[neighbor] = tentative_g
                heappush(oheap, (tentative_g + _sq_dist(neighbor, goal), neighbor))

    return []
74
+
75
+
76
def preprocess_image(img, target_size):
    """Optionally crop *img*, drop any alpha channel, and convert to grayscale.

    Returns the processed image, or None when any step fails.
    """
    try:
        if target_size is not None:
            # target_size is (row_start, row_end, col_start, col_end);
            # this indexing requires a 3-D (colour) input.
            img = img[target_size[0]:target_size[1], target_size[2]:target_size[3], :]
        has_alpha = img.ndim == 3 and img.shape[2] == 4
        if has_alpha:
            img = img[..., :3]
        if img.ndim > 2:
            img = rgb2gray(img)
        return img
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        return None
89
+
90
+
91
def horizontal_projections(sobel_image):
    """Sum the (binary) image across columns, yielding one value per row."""
    return np.asarray(sobel_image).sum(axis=1)
94
+
95
+
96
def binarize_image(image):
    """Return a boolean mask that is True where a pixel is darker than Otsu's threshold."""
    otsu_threshold = threshold_otsu(image)
    return image < otsu_threshold
100
+
101
+
102
def find_peak_regions(hpp, threshold):
    """Indices of projection-profile rows whose value falls below *threshold*.

    Low-projection rows correspond to the gaps between text lines.
    """
    return [index for index, value in enumerate(hpp) if value < threshold]
109
+
110
+
111
def line_segmentation(image, threshold=None, min_peak_group_size=7, target_size=None,
                      ct=0, parent_line_num=None, recursive=False, recursive_count=1,
                      base_key="line"):
    """
    Segment an image into text lines using horizontal projections and A*/graph routing.

    Gaps between lines show up as low rows in the horizontal projection
    profile; a minimum-cost path is routed through each gap band with
    `route_through_array`, and the image is sliced between consecutive paths.
    Unusually tall slices (above the 90th-percentile height) are re-segmented
    recursively once with a smaller peak-group size.

    Args:
        image: Input image (numpy array)
        threshold (float, optional): Threshold for peak detection; defaults to
            the midpoint of the projection profile's range.
        min_peak_group_size (int): Minimum size of peak groups to consider
        target_size (tuple, optional): Crop window for image preprocessing
        ct (int): Counter for line numbering (threaded through recursive calls)
        parent_line_num (str, optional): Parent line number for recursive segmentation
        recursive (bool): Whether this is a recursive call
        recursive_count (int): Counter for recursive segmentation numbering
        base_key (str): Base key for dictionary entries

    Returns:
        tuple: (segmented_images_dict, counter value, bool indicating if valid
        separations were found). Dict values are {"image": ndarray,
        "transcription": key-string placeholder}.
    """
    segmented_images_dict = {}

    img = preprocess_image(image, target_size)
    if img is None:
        print(f"Failed to preprocess image")
        return segmented_images_dict, ct, False

    # Binarize image and get projections (row sums of the dark-pixel mask).
    binarized_image = binarize_image(img)
    hpp = horizontal_projections(binarized_image)

    # Default threshold: midpoint of the projection profile's range.
    if threshold is None:
        threshold = (np.max(hpp) - np.min(hpp)) / 2

    # Find peaks: rows below threshold, i.e. candidate inter-line gaps.
    peaks = find_peak_regions(hpp, threshold)
    if not peaks:
        print(f"No peaks found in image")
        return segmented_images_dict, ct, False

    peaks_indexes = np.array(peaks).astype(int)

    # Visualization aid: zero out gap rows on a copy (the copy is not used
    # further below).
    segmented_img = np.copy(img)
    r, c = segmented_img.shape
    for ri in range(r):
        if ri in peaks_indexes:
            segmented_img[ri, :] = 0

    # Group consecutive gap rows into bands; drop bands that are too thin.
    diff_between_consec_numbers = np.diff(peaks_indexes)
    indexes_with_larger_diff = np.where(diff_between_consec_numbers > 1)[0].flatten()
    peak_groups = np.split(peaks_indexes, indexes_with_larger_diff + 1)
    peak_groups = [item for item in peak_groups if len(item) > min_peak_group_size]

    if not peak_groups:
        print(f"No valid peak groups found in image")
        return segmented_images_dict, ct, False

    binary_image = get_binary(img)
    segment_separating_lines = []

    # Route a minimum-cost separating path through each gap band.
    for sub_image_index in peak_groups:
        try:
            start_row = sub_image_index[0]
            end_row = sub_image_index[-1]

            start_row = max(0, start_row)
            end_row = min(binary_image.shape[0], end_row)

            if end_row <= start_row:
                continue

            nmap = binary_image[start_row:end_row, :]

            if nmap.size == 0:
                continue

            # Route from the band's vertical midpoint, left edge to right edge.
            start_point = (int(nmap.shape[0] / 2), 0)
            end_point = (int(nmap.shape[0] / 2), nmap.shape[1] - 1)

            path, _ = route_through_array(nmap, start_point, end_point)
            # NOTE(review): this offsets BOTH coordinates by start_row, so the
            # column values are shifted too; only path[:, 0] (rows) is consumed
            # below, which is why this has gone unnoticed — confirm intent.
            path = np.array(path) + start_row
            segment_separating_lines.append(path)
        except Exception as e:
            print(f"Failed to process sub-image: {e}")
            continue

    if not segment_separating_lines:
        print(f"No valid segment separating lines found in image")
        return segmented_images_dict, ct, False

    # Separate images based on line segments: slice between the lowest row of
    # one separator and the highest row of the next.
    seperated_images = []
    for index in range(len(segment_separating_lines) - 1):
        try:
            lower_line = np.min(segment_separating_lines[index][:, 0])
            upper_line = np.max(segment_separating_lines[index + 1][:, 0])

            if lower_line < upper_line and upper_line <= img.shape[0]:
                line_image = img[lower_line:upper_line]
                if line_image.size > 0:
                    seperated_images.append(line_image)
        except Exception as e:
            print(f"Failed to separate image at index {index}: {e}")
            continue

    if not seperated_images:
        print(f"No valid separated images found in image")
        return segmented_images_dict, ct, False

    # Calculate height threshold: slices taller than the 90th percentile are
    # suspected to contain multiple merged lines and get re-segmented.
    try:
        image_heights = [line_image.shape[0] for line_image in seperated_images if line_image.size > 0]
        if not image_heights:
            print(f"No valid image heights found")
            return segmented_images_dict, ct, False
        height_threshold = np.percentile(image_heights, 90)
    except Exception as e:
        print(f"Failed to calculate height threshold: {e}")
        return segmented_images_dict, ct, False

    # Process each separated image.
    for index, line_image in enumerate(seperated_images):
        try:
            if line_image.size == 0 or line_image.shape[0] == 0 or line_image.shape[1] == 0:
                continue

            if parent_line_num is None:
                # Top-level call: number lines sequentially via ct.
                dict_key = f'{base_key}_{ct + 1}'
            else:
                dict_key = f'{base_key}_{recursive_count}'
                # NOTE(review): in recursive mode this skips every separated
                # segment except the last one, so a recursive call stores at
                # most one entry — confirm this is the intended behaviour.
                if index < len(seperated_images) - 1:
                    continue

            segmented_images_dict[dict_key] = {
                "image": line_image.copy(),
                "transcription": f"{dict_key}"
            }

            # print(f"Added line image to dictionary with key: {dict_key}")

            # Handle recursive segmentation (only one level deep: recursive
            # calls never recurse again).
            if line_image.shape[0] > height_threshold and not recursive:
                try:
                    # Create recursive base key from the current counter.
                    recursive_base_key = f"{base_key}_{ct + 1}"

                    # Do recursive segmentation; ct is threaded through so
                    # numbering stays globally consistent.
                    recursive_dict, ct, found_valid_separations = line_segmentation(
                        line_image, threshold=threshold,
                        min_peak_group_size=3,
                        parent_line_num=str(ct + 1),
                        recursive=True,
                        ct=ct,
                        recursive_count=1,
                        base_key=recursive_base_key
                    )

                    if found_valid_separations:
                        # Replace the merged slice with its finer sub-lines.
                        del segmented_images_dict[dict_key]
                        segmented_images_dict.update(recursive_dict)
                        print(f"Replaced {dict_key} with recursive segmentation results")
                    else:
                        print(f"Keeping original image {dict_key} as no valid separations were found")

                except Exception as e:
                    print(f"Failed during recursive segmentation of {dict_key}: {e}")

            ct += 1
            if recursive:
                recursive_count += 1

        except Exception as e:
            print(f"Failed to process line image at index {index}: {e}")
            continue
    logger.info(f"Total lines segment found: {len(segmented_images_dict)}")
    return segmented_images_dict, ct, len(seperated_images) > 0
288
+
289
+
290
def segment_image_to_lines(image_array, **kwargs):
    """
    Convenience wrapper around :func:`line_segmentation`.

    Args:
        image_array: Input image as numpy array
        **kwargs: Forwarded verbatim to line_segmentation

    Returns:
        dict: Mapping of line keys to segmented image entries; empty dict on
        any failure.
    """
    try:
        logger.info("Starting line segmentation...")
        lines_dict, _, succeeded = line_segmentation(image_array, **kwargs)
        if succeeded:
            logger.info("Line segmentation successful.....")
        return lines_dict
    except Exception as e:
        logger.error(f"Line segmentation failed: {e}")
        return {}
312
+
313
+ # if __name__ == "__main__":
314
+ # # Example usage
315
+ # image_path = "./renAI-deploy/1.png"
316
+ # image = imread(image_path)
317
+ # segmented_lines = segment_image_to_lines(image, threshold=None, min_peak_group_size=10)
318
+
319
+
320
+ # print(len(segmented_lines.values()))
321
+
322
+ # for key, value in segmented_lines.items():
323
+ # print(f"{key}: {value['image'].shape}")
324
+ # print(f"{key}: {value['transcription']}")
325
+ # # plt.imshow(img, cmap='gray')
326
+ # # plt.title(key)
327
+ # # plt.show()
utils/postprocessing.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+ from collections import defaultdict
4
+ from typing import List, Tuple, Dict, Set
5
+ import heapq
6
+ from loguru import logger
7
+
8
class SpanishFuzzyMatcher:
    """Fuzzy dictionary matcher for correcting Spanish OCR output.

    Loads a word list, builds length- and trigram-based candidate indexes,
    and scores candidates with a blend of Levenshtein, Damerau-Levenshtein
    and Jaro-Winkler similarities plus Spanish-specific bonuses.
    """

    def __init__(self, dictionary_path: str):
        # Full vocabulary of cleaned, lower-cased words.
        self.dictionary = set()
        # Words bucketed by length, for cheap length-window candidate lookup.
        self.word_by_length = defaultdict(list)
        # Character trigram -> set of dictionary words containing it.
        self.ngram_index = defaultdict(set)
        # Frequent Spanish words present in the dictionary (get a score bonus).
        self.common_words = set()

        self._load_dictionary(dictionary_path)
        self._build_indexes()
        self._load_common_words()

    def _detect_encoding(self, path: str) -> str:
        """Return the first candidate encoding that can read the file's first 1KB."""
        encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252', 'utf-16']

        for encoding in encodings:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    f.read(1024)  # Try to read first 1KB
                return encoding
            except (UnicodeDecodeError, UnicodeError):
                continue

        # Fallback; _load_dictionary opens with errors='ignore' anyway.
        return 'utf-8'

    def _load_dictionary(self, path: str):
        """Load and clean the word list, filling dictionary and word_by_length."""
        try:
            encoding = self._detect_encoding(path)
            print(f"Detected encoding: {encoding}")

            with open(path, 'r', encoding=encoding, errors='ignore') as f:
                for line_num, line in enumerate(f, 1):
                    try:
                        word = line.strip().lower()
                        if word and len(word) > 1:
                            # Remove any non-alphabetic characters except hyphens and apostrophes
                            cleaned_word = re.sub(r"[^a-záéíóúüñç\-']", "", word)
                            if cleaned_word and len(cleaned_word) > 1:
                                self.dictionary.add(cleaned_word)
                                self.word_by_length[len(cleaned_word)].append(cleaned_word)
                    except Exception as e:
                        print(f"Warning: Skipping line {line_num} due to error: {e}")
                        continue

            print(f"Loaded {len(self.dictionary)} words from dictionary")

        except FileNotFoundError:
            raise FileNotFoundError(f"Dictionary file not found: {path}")
        except Exception as e:
            raise Exception(f"Error loading dictionary: {e}")

    def _load_common_words(self):
        """Intersect a hard-coded frequent-Spanish-word list with the loaded dictionary."""
        common_spanish = {
            'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'las', 'del', 'los', 'una', 'mi', 'muy', 'mas', 'me', 'si', 'ya', 'todo', 'como', 'pero', 'hay', 'o', 'cuando', 'esta', 'ser', 'tiene', 'estar', 'hacer', 'sobre', 'entre', 'poder', 'antes', 'tiempo', 'año', 'casa', 'día', 'vida', 'trabajo', 'hombre', 'mujer', 'mundo', 'parte', 'momento', 'lugar', 'país', 'forma', 'manera', 'estado', 'caso', 'grupo', 'agua', 'punto', 'vez', 'donde', 'quien', 'haber', 'tener', 'hacer', 'decir', 'ir', 'ver', 'dar', 'saber', 'querer', 'llegar', 'pasar', 'deber', 'poner', 'parecer', 'quedar', 'creer', 'hablar', 'llevar', 'dejar', 'seguir', 'encontrar', 'llamar', 'venir', 'pensar', 'salir', 'volver', 'tomar', 'conocer', 'vivir', 'sentir', 'tratar', 'mirar', 'contar', 'empezar', 'esperar', 'buscar', 'existir', 'entrar', 'trabajar', 'escribir', 'perder', 'producir', 'ocurrir', 'entender', 'pedir', 'recibir', 'recordar', 'terminar', 'permitir', 'aparecer', 'conseguir', 'comenzar', 'servir', 'sacar', 'necesitar', 'mantener', 'resultar', 'leer', 'caer', 'cambiar', 'presentar', 'crear', 'abrir', 'considerar', 'oír', 'acabar', 'convertir', 'ganar', 'traer', 'realizar', 'suponer', 'comprender', 'explicar', 'dedicar', 'andar', 'estudiar', 'mano', 'cabeza', 'ojo', 'cara', 'pie', 'corazón', 'vez', 'palabra', 'número', 'color', 'mesa', 'silla', 'libro', 'papel', 'coche', 'calle', 'puerta', 'ventana', 'ciudad', 'pueblo', 'escuela', 'hospital', 'iglesia', 'tienda', 'mercado', 'banco', 'hotel', 'restaurante', 'café', 'bar', 'teatro', 'cine', 'museo', 'parque', 'jardín', 'playa', 'montaña', 'río', 'mar', 'lago', 'bosque', 'árbol', 'flor', 'animal', 'perro', 'gato', 'pájaro', 'pez', 'comida', 'pan', 'carne', 'pollo', 'pescado', 'leche', 'huevo', 'queso', 'fruta', 'verdura', 'patata', 'tomate', 'cebolla', 'ajo', 'sal', 'azúcar', 'aceite', 'vino', 'cerveza', 'café', 'té', 'agua', 'fuego', 'aire', 'tierra', 'sol', 'luna', 'estrella', 'nube', 'lluvia', 'nieve', 'viento', 'calor', 'frío', 'luz', 'sombra', 'mañana', 'tarde', 'noche', 'hoy', 'ayer', 'mañana', 'semana', 'mes', 'año', 'hora', 'minuto', 'segundo', 'lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo', 'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre', 'primavera', 'verano', 'otoño', 'invierno', 'bueno', 'malo', 'grande', 'pequeño', 'alto', 'bajo', 'largo', 'corto', 'ancho', 'estrecho', 'grueso', 'delgado', 'fuerte', 'débil', 'rápido', 'lento', 'fácil', 'difícil', 'nuevo', 'viejo', 'joven', 'mayor', 'blanco', 'negro', 'rojo', 'azul', 'verde', 'amarillo', 'gris', 'marrón', 'rosa', 'naranja', 'morado', 'feliz', 'triste', 'contento', 'enfadado', 'cansado', 'aburrido', 'interesante', 'divertido', 'importante', 'necesario', 'posible', 'imposible', 'seguro', 'peligroso', 'rico', 'pobre', 'caro', 'barato', 'limpio', 'sucio', 'sano', 'enfermo', 'vivo', 'muerto', 'lleno', 'vacío', 'abierto', 'cerrado', 'caliente', 'frío', 'seco', 'mojado', 'duro', 'blando', 'suave', 'áspero', 'dulce', 'amargo', 'salado', 'picante', 'conocerte', 'tengas'
        }
        self.common_words = {word for word in common_spanish if word in self.dictionary}
        print(f"Loaded {len(self.common_words)} common words")

    def _is_common_spanish_error(self, ocr_word: str, dict_word: str) -> bool:
        """True when the two equal-length words differ by exactly one character
        that is a known Spanish OCR confusion (b/v, c/s, etc.)."""
        ocr_lower = ocr_word.lower()
        dict_lower = dict_word.lower()

        # Common OCR confusions in Spanish.
        # NOTE(review): this dict literal has duplicate keys, so later entries
        # win: 's' maps only to 'z' (not 'c'), 'n' only to 'ñ' (not 'u'),
        # 'y' only to 'll' (not 'i'); the two-character 'll' entries can never
        # match the single-character comparison below. Confirm intent.
        ocr_substitutions = {
            'b': 'v', 'v': 'b',  # b/v confusion
            'c': 's', 's': 'c',  # c/s confusion
            'z': 's', 's': 'z',  # z/s confusion
            'j': 'g', 'g': 'j',  # j/g confusion
            'y': 'i', 'i': 'y',  # y/i confusion
            'u': 'n', 'n': 'u',  # u/n confusion (handwriting)
            'll': 'y', 'y': 'll',  # ll/y confusion
            'ñ': 'n', 'n': 'ñ',  # ñ/n confusion
        }

        if len(ocr_lower) == len(dict_lower):
            diff_count = sum(1 for a, b in zip(ocr_lower, dict_lower) if a != b)
            if diff_count == 1:
                # Locate the single differing position and check the confusion table.
                for i, (a, b) in enumerate(zip(ocr_lower, dict_lower)):
                    if a != b:
                        return a in ocr_substitutions and ocr_substitutions[a] == b

        return False

    def _build_indexes(self):
        """Index every dictionary word by its character trigrams ('$' pads the ends)."""
        for word in self.dictionary:
            padded_word = f"${word}$"
            for i in range(len(padded_word) - 2):
                trigram = padded_word[i:i+3]
                self.ngram_index[trigram].add(word)

    def _normalize_text(self, text: str) -> str:
        """Lower-case *text* and strip diacritics (NFD decomposition, drop combining marks)."""
        text = unicodedata.normalize('NFD', text)
        text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
        return text.lower()

    def _levenshtein_distance(self, s1: str, s2: str) -> int:
        """Classic two-row dynamic-programming edit distance."""
        if len(s1) < len(s2):
            # Keep s2 as the shorter string so the row buffer is minimal.
            return self._levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def _damerau_levenshtein_distance(self, s1: str, s2: str) -> int:
        """Full Damerau-Levenshtein distance (edit distance with transpositions)."""
        len1, len2 = len(s1), len(s2)

        # da: last row in s1 where each character was seen.
        da = {}
        for char in s1 + s2:
            if char not in da:
                da[char] = 0

        max_dist = len1 + len2
        h = [[max_dist for _ in range(len2 + 2)] for _ in range(len1 + 2)]

        h[0][0] = max_dist
        for i in range(0, len1 + 1):
            h[i + 1][0] = max_dist
            h[i + 1][1] = i
        for j in range(0, len2 + 1):
            h[0][j + 1] = max_dist
            h[1][j + 1] = j

        for i in range(1, len1 + 1):
            db = 0
            for j in range(1, len2 + 1):
                k = da[s2[j - 1]]
                l = db
                if s1[i - 1] == s2[j - 1]:
                    cost = 0
                    db = j
                else:
                    cost = 1

                h[i + 1][j + 1] = min(
                    h[i][j] + cost,  # substitution
                    h[i + 1][j] + 1,  # insertion
                    h[i][j + 1] + 1,  # deletion
                    h[k][l] + (i - k - 1) + 1 + (j - l - 1)  # transposition
                )

            da[s1[i - 1]] = i

        return h[len1 + 1][len2 + 1]

    def _jaro_winkler_similarity(self, s1: str, s2: str) -> float:
        """Jaro similarity boosted by up to a 4-character common prefix (factor 0.1)."""
        def jaro_similarity(s1: str, s2: str) -> float:
            if s1 == s2:
                return 1.0

            len1, len2 = len(s1), len(s2)
            if len1 == 0 or len2 == 0:
                return 0.0

            # Characters match when within half the longer length of each other.
            match_window = max(len1, len2) // 2 - 1
            if match_window < 0:
                match_window = 0

            s1_matches = [False] * len1
            s2_matches = [False] * len2

            matches = 0
            transpositions = 0

            for i in range(len1):
                start = max(0, i - match_window)
                end = min(i + match_window + 1, len2)

                for j in range(start, end):
                    if s2_matches[j] or s1[i] != s2[j]:
                        continue
                    s1_matches[i] = s2_matches[j] = True
                    matches += 1
                    break

            if matches == 0:
                return 0.0

            # Count out-of-order matched characters (transpositions).
            k = 0
            for i in range(len1):
                if not s1_matches[i]:
                    continue
                while not s2_matches[k]:
                    k += 1
                if s1[i] != s2[k]:
                    transpositions += 1
                k += 1

            jaro = (matches / len1 + matches / len2 +
                    (matches - transpositions / 2) / matches) / 3
            return jaro

        jaro = jaro_similarity(s1, s2)

        prefix_len = 0
        for i in range(min(len(s1), len(s2), 4)):
            if s1[i] == s2[i]:
                prefix_len += 1
            else:
                break

        return jaro + (0.1 * prefix_len * (1 - jaro))

    def _get_candidates(self, word: str, max_candidates: int = 200) -> Set[str]:
        """Gather a bounded candidate pool: common words of similar length,
        words within +/-2 of the query length, and top shared-trigram words."""
        candidates = set()
        word_len = len(word)

        common_candidates = set()
        for common_word in self.common_words:
            if abs(len(common_word) - word_len) <= 2:
                common_candidates.add(common_word)

        candidates.update(common_candidates)

        for length in range(max(1, word_len - 2), word_len + 3):
            length_words = self.word_by_length[length]
            # Sort by length (shorter words first) and limit
            sorted_words = sorted(length_words, key=len)[:max_candidates//3]
            candidates.update(sorted_words)

        padded_word = f"${word}$"
        trigram_candidates = set()
        trigram_scores = defaultdict(int)

        # Rank words by how many trigrams they share with the query.
        for i in range(len(padded_word) - 2):
            trigram = padded_word[i:i+3]
            if trigram in self.ngram_index:
                for candidate in self.ngram_index[trigram]:
                    trigram_scores[candidate] += 1

        sorted_trigram = sorted(trigram_scores.items(), key=lambda x: x[1], reverse=True)
        trigram_candidates = {word for word, score in sorted_trigram[:max_candidates//2]}
        candidates.update(trigram_candidates)

        return candidates

    def _calculate_composite_score(self, word1: str, word2: str) -> float:
        """Blend several similarity measures into one score in [0, 1].

        Base score = 0.25*Levenshtein + 0.45*Damerau + 0.25*Jaro-Winkler
        + 0.05*length penalty, then multiplied by bonuses for common words
        (1.3), known Spanish OCR confusions (1.2) and exact-length matches
        (1.1), clipped to 1.0.
        """
        norm_word1 = self._normalize_text(word1)
        norm_word2 = self._normalize_text(word2)

        levenshtein = self._levenshtein_distance(norm_word1, norm_word2)
        damerau = self._damerau_levenshtein_distance(norm_word1, norm_word2)
        jaro_winkler = self._jaro_winkler_similarity(norm_word1, norm_word2)

        max_len = max(len(norm_word1), len(norm_word2))
        if max_len == 0:
            return 1.0

        # Convert distances to similarities in [0, 1].
        levenshtein_sim = 1 - (levenshtein / max_len)
        damerau_sim = 1 - (damerau / max_len)

        length_diff = abs(len(norm_word1) - len(norm_word2))
        length_penalty = 1 - (length_diff / max(len(norm_word1), len(norm_word2)))

        frequency_bonus = 1.0
        if norm_word2 in self.common_words:
            frequency_bonus = 1.3

        spanish_error_bonus = 1.0
        if self._is_common_spanish_error(word1, word2):
            spanish_error_bonus = 1.2

        exact_length_bonus = 1.0
        if len(norm_word1) == len(norm_word2):
            exact_length_bonus = 1.1

        base_score = (
            0.25 * levenshtein_sim +
            0.45 * damerau_sim +
            0.25 * jaro_winkler +
            0.05 * length_penalty
        )

        final_score = base_score * frequency_bonus * spanish_error_bonus * exact_length_bonus

        return min(final_score, 1.0)

    def find_best_matches(self, word: str, top_k: int = 5, threshold: float = 0.4) -> List[Tuple[str, float]]:
        """Return up to *top_k* (candidate, score) pairs with score >= *threshold*.

        Exact dictionary hits (accent-insensitive or lower-cased) short-circuit
        with a score of 1.0.
        """
        if not word or len(word) < 2:
            return []

        normalized_word = self._normalize_text(word)
        if normalized_word in self.dictionary:
            return [(word, 1.0)]

        if word.lower() in self.dictionary:
            return [(word.lower(), 1.0)]

        candidates = self._get_candidates(normalized_word)

        # Max-heap via negated scores; ties break alphabetically on candidate.
        scored_matches = []
        for candidate in candidates:
            score = self._calculate_composite_score(word, candidate)
            if score >= threshold:
                heapq.heappush(scored_matches, (-score, candidate, score))

        results = []
        seen_words = set()
        for _ in range(min(top_k, len(scored_matches))):
            if scored_matches:
                _, candidate, score = heapq.heappop(scored_matches)
                if candidate not in seen_words:
                    results.append((candidate, score))
                    seen_words.add(candidate)

        return results

    def correct_sentence(self, sentence: str, confidence_threshold: float = 0.6) -> str:
        """Replace each word token whose best match scores >= *confidence_threshold*.

        Non-word runs (spaces, punctuation) are passed through untouched, so
        the sentence's spacing and punctuation are preserved exactly.
        """
        words = re.findall(r'\b\w+\b|\W+', sentence)
        corrected_words = []

        for token in words:
            if re.match(r'\b\w+\b', token):
                matches = self.find_best_matches(token, top_k=1, threshold=0.3)

                if matches and matches[0][1] >= confidence_threshold:
                    corrected_words.append(matches[0][0])
                else:
                    corrected_words.append(token)
            else:
                corrected_words.append(token)

        return ''.join(corrected_words)
339
+
340
def PostProcessing(ocr_sentence):
    """Spell-correct an OCR'd Spanish sentence against the bundled dictionary.

    Builds a SpanishFuzzyMatcher from the packaged 136k-word list and runs the
    sentence through it; on any failure the original sentence is returned
    unchanged (best-effort correction).
    """
    try:
        logger.info("Post processing started......")
        matcher = SpanishFuzzyMatcher('Diccionario.Espanol.136k.palabras.txt')
        logger.info("Dictionary loaded successfully!")

        result = matcher.correct_sentence(ocr_sentence, confidence_threshold=0.6)
        logger.info("Post processing completed successfully!")
        return result

    except Exception as e:
        print(e)
        logger.error(f"Post processing failed: {e}")
        return ocr_sentence
+ return ocr_sentence
utils/preprocessing.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import os
5
+ from deskew import determine_skew
6
+ from typing import Tuple, Union
7
+ import math
8
+ from loguru import logger
9
+
10
def preprocessImage(image):
    """
    Clean a document image for OCR: denoise, binarize with Otsu, erase long
    horizontal/vertical ruling lines, then morphologically repair the
    remaining strokes and re-invert.

    Args:
        image: BGR colour image (numpy array, e.g. as returned by cv2.imread).

    Returns:
        numpy array: cleaned grayscale image, dark text on a white background.
    """

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply denoising
    gray = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # Apply binary thresholding using Otsu's method (text becomes white on black)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Copy the original image to preserve it
    removed = image.copy()

    # Remove vertical lines: the tall 1x40 opening keeps only long vertical
    # strokes, whose contours are then painted over in white.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # OpenCV 3.x returns 3 values, 4.x returns 2 — pick the contour list either way.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(removed, [c], -1, (255, 255, 255), 4)

    # Remove horizontal lines with the transposed (40x1) kernel.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(removed, [c], -1, (255, 255, 255), 5)

    # Repair kernel: reconnect strokes broken by the line removal.
    repair_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    removed = 255 - removed
    dilate = cv2.dilate(removed, repair_kernel, iterations=5)
    dilate = cv2.cvtColor(dilate, cv2.COLOR_BGR2GRAY)
    # Keep only dilated pixels that were text in the Otsu mask.
    pre_result = cv2.bitwise_and(dilate, thresh)

    # Final result: close small gaps, then mask against the text pixels again.
    result = cv2.morphologyEx(pre_result, cv2.MORPH_CLOSE, repair_kernel, iterations=5)
    final = cv2.bitwise_and(result, thresh)

    # Invert the final image back to dark text on a white background.
    invert_final = 255 - final

    # processed_image_path = os.path.join(folder_path, f"{os.path.splitext(os.path.basename(image_path))[0]}-preprocessed.png")
    # Save the final image
    # cv2.imwrite(processed_image_path, invert_final)

    return invert_final
70
+
71
def process_segment_and_crop_image(model, image, preprocess_image_path, padding=10, min_contour_area=100):
    """
    Predict a text-region mask with a U-Net and crop the image stored at
    *preprocess_image_path* to the padded bounding box of all detected regions.

    Args:
        model: Trained U-Net; fed a (1, 512, 512, 1) tensor built below —
            assumes the model expects that input shape, TODO confirm.
        image: BGR image used as the model input (numpy array).
        preprocess_image_path (str): Path of the image that is actually cropped.
        padding (int): Extra pixels added around the detected region.
        min_contour_area (int): Contours smaller than this are treated as noise.

    Returns:
        numpy array or None: the cropped image, or None if no valid text
        region was found.

    NOTE(review): the mask is predicted from *image* but the crop is applied to
    the file read from *preprocess_image_path*; the two are presumably the same
    page in different processing states — verify against callers.
    """
    # Read the original image in grayscale

    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding to create a binary image
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)

    # Resize the image to the model input size (512x512)
    img = cv2.resize(img, (512, 512))

    # Expand dimensions to match model input: (H, W) -> (1, H, W, 1)
    img = np.expand_dims(img, axis=-1)
    img_np = np.expand_dims(img, axis=0)

    # Predict the segmentation mask using the U-Net model
    pred = model.predict(img_np)
    pred = np.squeeze(np.squeeze(pred, axis=0), axis=-1)

    # # Display the segmentation result
    # plt.imshow(pred, cmap='gray')
    # plt.title('U-Net Segmentation')
    # plt.axis('off')
    # plt.show()

    # Read the original image
    original_img = cv2.imread(preprocess_image_path)

    # Get original dimensions
    ori_height, ori_width = original_img.shape[:2]

    # Resize the mask to match the original image dimensions
    resized_mask = cv2.resize(pred, (ori_width, ori_height))

    # Convert the resized mask to 8-bit unsigned integer type
    resized_mask = (resized_mask * 255).astype(np.uint8)

    # Apply Otsu's threshold to get a binary image
    _, binary_mask = cv2.threshold(resized_mask, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Apply morphological operations to remove noise and connect nearby text
    kernel = np.ones((5, 5), np.uint8)
    cleaned_mask = cv2.morphologyEx(binary_mask, cv2.MORPH_CLOSE, kernel)
    cleaned_mask = cv2.morphologyEx(cleaned_mask, cv2.MORPH_OPEN, kernel)

    # Find contours in the cleaned mask
    contours, _ = cv2.findContours(cleaned_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Filter contours based on area to remove small noise
    valid_contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_contour_area]

    if not valid_contours:
        print("No valid text regions found.")
        return None

    # Find the bounding rectangle that encompasses all valid contours
    x_min, y_min = ori_width, ori_height
    x_max, y_max = 0, 0

    for contour in valid_contours:
        x, y, w, h = cv2.boundingRect(contour)
        x_min = min(x_min, x)
        y_min = min(y_min, y)
        x_max = max(x_max, x + w)
        y_max = max(y_max, y + h)

    # Grow the box by the padding, clamped to the image bounds.
    x_min = max(0, x_min - padding)
    y_min = max(0, y_min - padding)
    x_max = min(ori_width, x_max + padding)
    y_max = min(ori_height, y_max + padding)

    # Crop the original image
    cropped_img = original_img[y_min:y_max, x_min:x_max]

    return cropped_img
160
+
161
+
162
def postProcessImage(cropped_image):
    """
    Post-process a cropped page/line image: (deskew currently disabled),
    sharpen via an unsharp-mask blend, and compute a dilated variant.

    Args:
        cropped_image: input image (numpy array).

    Returns:
        numpy array: the sharpened image.

    NOTE(review): `dilated` is computed but never returned — and with a 1x1
    kernel the dilation is a no-op anyway; the deskew path and the inner
    `rotate` helper are also unused. Confirm whether these should be
    re-enabled or deleted.
    """
    def rotate(
        image: np.ndarray, angle: float, background: Union[int, Tuple[int, int, int]]
    ) -> np.ndarray:
        # Rotate `image` by `angle` degrees, enlarging the canvas so nothing
        # is clipped. NOTE(review): numpy shape is (rows, cols) = (height,
        # width), so these names are swapped; the swap appears to be applied
        # consistently below, but verify before re-enabling deskew.
        old_width, old_height = image.shape[:2]
        angle_radian = math.radians(angle)
        width = abs(np.sin(angle_radian) * old_height) + abs(np.cos(angle_radian) * old_width)
        height = abs(np.sin(angle_radian) * old_width) + abs(np.cos(angle_radian) * old_height)

        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift so the rotated content stays centred on the enlarged canvas.
        rot_mat[1, 2] += (width - old_width) / 2
        rot_mat[0, 2] += (height - old_height) / 2
        return cv2.warpAffine(image, rot_mat, (int(round(height)), int(round(width))), borderValue=background)

    # Deskew Image (disabled — input is passed through unrotated)
    # grayscale = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)
    # angle = determine_skew(grayscale)
    # rotated = rotate(image, angle, (0, 0, 0))
    rotated = cropped_image

    # Sharpening (reduced intensity): unsharp mask — subtract a blurred copy.
    blurred = cv2.GaussianBlur(rotated, (1,1), sigmaX=3, sigmaY=3)
    sharpened = cv2.addWeighted(rotated, 1.5, blurred, -0.5, 0)

    # Morphological dilation to thicken the text
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    dilated = cv2.dilate(sharpened, dilate_kernel, iterations=1)

    return sharpened
utils/unet.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Importing required libraries.
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import cv2
5
+ import os
6
+ from keras.layers import *
7
+ from keras.models import Model
8
+ from keras.optimizers import Adam
9
+ import random
10
+
11
def unet(pretrained_weights = None,input_size = (512,512,1)):
    """
    Build and compile a U-Net segmentation model (Ronneberger et al., 2015).

    Args:
        pretrained_weights (str, optional): path to a weights file to load
            into the model after compilation.
        input_size (tuple): input tensor shape as (height, width, channels).

    Returns:
        keras.Model: model compiled with Adam (lr=1e-4) and binary
        cross-entropy, producing a single-channel sigmoid mask at the
        input's spatial resolution.
    """
    def _double_conv(x, filters):
        # Two 3x3 same-padded ReLU convolutions — the basic U-Net block.
        x = Conv2D(filters, 3, activation='relu', padding='same', kernel_initializer='he_normal')(x)
        return Conv2D(filters, 3, activation='relu', padding='same', kernel_initializer='he_normal')(x)

    def _up_concat(x, skip, filters):
        # 2x upsample, channel-reducing 2x2 conv, then concat with the
        # matching encoder feature map (skip connection) on the channel axis.
        up = Conv2D(filters, 2, activation='relu', padding='same', kernel_initializer='he_normal')(UpSampling2D(size=(2, 2))(x))
        return concatenate([skip, up], axis=3)

    inputs = Input(input_size)

    # Contracting path (encoder).
    conv1 = _double_conv(inputs, 64)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2 = _double_conv(pool1, 128)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    conv3 = _double_conv(pool2, 256)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
    drop4 = Dropout(0.5)(_double_conv(pool3, 512))
    pool4 = MaxPooling2D(pool_size=(2, 2))(drop4)

    # Bottleneck.
    drop5 = Dropout(0.5)(_double_conv(pool4, 1024))

    # Expanding path (decoder) with skip connections.
    conv6 = _double_conv(_up_concat(drop5, drop4, 512), 512)
    conv7 = _double_conv(_up_concat(conv6, conv3, 256), 256)
    conv8 = _double_conv(_up_concat(conv7, conv2, 128), 128)
    conv9 = _double_conv(_up_concat(conv8, conv1, 64), 64)
    conv9 = Conv2D(2, 3, activation='relu', padding='same', kernel_initializer='he_normal')(conv9)
    # 1x1 sigmoid conv collapses to a single-channel probability mask.
    conv10 = Conv2D(1, 1, activation='sigmoid')(conv9)

    model = Model(inputs, conv10)

    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

    if pretrained_weights:
        model.load_weights(pretrained_weights)

    return model
63
+
64
+
vit.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
4
+ import torch
5
+ from torch.utils.data import Dataset, DataLoader
6
+ import torch
7
+ from torch.utils.data import Dataset, DataLoader, random_split
8
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Trainer, TrainingArguments, EarlyStoppingCallback
9
+ from PIL import Image
10
+ import numpy as np
11
+ from torch.nn.utils.rnn import pad_sequence
12
+ from torch.optim import AdamW
13
+ import torch.nn.functional as F
14
+ from evaluate import load
15
+ import albumentations as A
16
+ import os
17
+
18
+ from configs import model_path, processor_path
19
+
20
# NOTE(review): cudnn.benchmark enables cuDNN conv-algorithm autotuning
# (it does not enable mixed precision, despite the original comment);
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which
# helps debugging but hurts throughput — consider removing for training.
torch.backends.cudnn.benchmark = True
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


# Character- and word-error-rate metrics (from the `evaluate` library),
# used by compute_metrics below.
cer_metric = load("cer")
wer_metric = load("wer")

# Pretrained TrOCR processor/model loaded from local paths (see configs.py).
# do_rescale=False because LineDataset already scales pixels to [0, 1]
# before handing them to the processor.
processor = TrOCRProcessor.from_pretrained(processor_path, do_rescale=False,use_fast=True)
model = VisionEncoderDecoderModel.from_pretrained(model_path,use_safetensors=True)
31
+
32
def compute_metrics(eval_pred):
    """
    Compute CER and WER for a batch of TrOCR predictions.

    Args:
        eval_pred: (logits, labels) pair from the HF Trainer; logits may
            arrive wrapped in a tuple.

    Returns:
        dict: {"cer": float, "wer": float}.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    predicted_ids = logits.argmax(-1)
    decoded_preds = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

    # -100 marks padding positions ignored by the loss; strip them before decoding.
    decoded_labels = [
        processor.tokenizer.decode([tok for tok in label if tok != -100], skip_special_tokens=True)
        for label in labels
    ]

    return {
        "cer": cer_metric.compute(predictions=decoded_preds, references=decoded_labels),
        "wer": wer_metric.compute(predictions=decoded_preds, references=decoded_labels),
    }
46
+
47
class LineDataset(Dataset):
    """
    Torch Dataset mapping text-line images + transcripts to TrOCR inputs.

    Each item is a dict with 'pixel_values' (preprocessed image tensor) and
    'labels' (token ids truncated to max_length). When apply_augmentation
    is True, a mild albumentations pipeline runs before resizing.
    """
    def __init__(self, processor, model, line_images, texts, target_size=(384, 96), max_length=512, apply_augmentation=False):
        # processor: TrOCRProcessor used for both image and text encoding.
        # model: VisionEncoderDecoderModel; its config.max_length is set
        #     here so generation length matches the label truncation.
        # line_images: sequence of PIL Images or numpy arrays.
        # texts: one transcription string per image (same order/length).
        # target_size: (width, height) as passed to PIL.Image.resize.
        # max_length: token-length cap applied to labels.
        # apply_augmentation: enable the albumentations pipeline below.
        self.line_images = line_images
        self.texts = texts
        self.processor = processor
        # NOTE(review): these assignments mutate the shared processor/model
        # objects, not just this dataset — the side effect is visible to
        # every other user of those objects.
        self.processor.image_processor.max_length = max_length
        self.processor.tokenizer.model_max_length = max_length
        self.model = model
        self.model.config.max_length = max_length
        self.target_size = target_size
        self.max_length = max_length
        self.apply_augmentation = apply_augmentation

        if apply_augmentation:
            # OneOf picks exactly one distortion per sample, and with p=0.7
            # skips augmentation entirely 30% of the time.
            self.transform = A.Compose([
                A.OneOf([
                    A.Rotate(limit=2, p=1.0),
                    A.ElasticTransform(alpha=0.3, sigma=50.0, alpha_affine=0.3, p=1.0),
                    A.OpticalDistortion(distort_limit=0.03, shift_limit=0.03, p=1.0),
                    A.CLAHE(clip_limit=2, tile_grid_size=(4, 4), p=1.0),
                    A.Affine(scale=(0.95, 1.05), translate_percent=(0.02, 0.02), shear=(-2, 2), p=1.0),
                    A.Perspective(scale=(0.01, 0.03), p=1.0),
                    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=1.0),
                    A.GaussianBlur(blur_limit=(3, 7), p=1.0),
                    A.GridDistortion(num_steps=3, distort_limit=0.02, p=1.0),
                    A.MedianBlur(blur_limit=3, p=1.0),
                ], p=0.7),
            ])
        else:
            # Empty pipeline; __getitem__ also re-checks apply_augmentation
            # before using it, so this is effectively never applied.
            self.transform = A.Compose([])

    def __len__(self):
        # Number of samples; texts is assumed to be the same length.
        return len(self.line_images)

    def __getitem__(self, idx):
        image = self.line_images[idx]
        text = self.texts[idx]

        # Normalize the input to an HxWx3 numpy array.
        if isinstance(image, Image.Image):
            image = np.array(image)

        if image.ndim == 2:
            # Grayscale: replicate the single channel to 3 channels.
            image = np.expand_dims(image, axis=-1)
            image = np.repeat(image, 3, axis=-1)

        # Assumes incoming pixel values are floats in [0, 1] — TODO confirm;
        # a uint8 [0, 255] input would wrap around here.
        image = (image * 255).astype(np.uint8)

        if self.apply_augmentation and self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        # Resize (PIL takes (width, height)), rescale back to [0, 1], and
        # move channels first: (C, H, W).
        image = Image.fromarray(image)
        image = image.resize(self.target_size, Image.LANCZOS)
        image = np.array(image) / 255.0
        image = np.transpose(image, (2, 0, 1))

        # The module-level processor in this file is built with
        # do_rescale=False, so the [0, 1] values are used as-is; `text` is
        # tokenized into 'labels' by the same call.
        encoding = self.processor(images=image, text=text, return_tensors="pt")
        # Truncate labels, then drop the batch dim the processor adds.
        encoding['labels'] = encoding['labels'][:, :self.max_length]
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding
107
+
108
def collate_fn(batch):
    """
    Collate LineDataset samples into a training batch.

    Args:
        batch (list[dict]): each item holds 'pixel_values' (equal-shaped
            tensors) and 'labels' (variable-length 1-D token tensors).

    Returns:
        dict: 'pixel_values' stacked along a new batch dimension and
        'labels' right-padded with -100 (the loss-ignore index).
    """
    images = [sample['pixel_values'] for sample in batch]
    label_seqs = [sample['labels'] for sample in batch]
    return {
        'pixel_values': torch.stack(images),
        'labels': pad_sequence(label_seqs, batch_first=True, padding_value=-100),
    }
+ return {'pixel_values': pixel_values, 'labels': labels}