nagpalsumit247 commited on
Commit
9a34207
·
verified ·
1 Parent(s): 489a250

Upload 4 files

Browse files
ocr_api/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ OCR API Package
3
+ Production-ready FastAPI OCR service using PaddleOCR
4
+ """
5
+
6
+ __version__ = "1.0.0"
ocr_api/main.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Application for OCR Service
3
+ Production-ready API for advanced OCR on scanned images
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import logging
9
+ from typing import Optional
10
+ from pathlib import Path
11
+ from contextlib import asynccontextmanager
12
+
13
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Query
14
+ from fastapi.middleware.cors import CORSMiddleware
15
+ from fastapi.responses import JSONResponse
16
+ import uvicorn
17
+
18
+ from ocr_api.ocr_service import OCRService
19
+
20
# Setup logging
# Configured once at import time; child loggers inherit this handler/format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global OCR service instance
# None until the lifespan handler assigns a real or mock service at startup.
ocr_service = None

# Check for GPU availability from environment
# Only the exact string "true" (case-insensitive) enables GPU.
use_gpu = os.getenv("USE_GPU", "false").lower() == "true"

# CORS allowed origins - configure for production
# Comma-separated list, e.g. "https://a.example,https://b.example".
allowed_origins = os.getenv("CORS_ORIGINS", "*").split(",")
if allowed_origins == ["*"]:
    logger.warning("CORS is configured to allow all origins. This is insecure for production.")
    logger.warning("Set CORS_ORIGINS environment variable with comma-separated allowed origins.")
38
+
39
+
40
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler: build the OCR engine before serving.

    Startup attempts the real PaddleOCR-backed service first; if that
    raises (e.g. models cannot be downloaded), it falls back to the mock
    implementation so the API remains usable for schema testing. If even
    the mock fails to construct, the error is re-raised and startup aborts.
    Shutdown only logs; the engines hold no resources needing teardown here.
    """
    global ocr_service

    # --- startup ---
    logger.info("Initializing OCR Service...")
    try:
        from ocr_api.ocr_service import OCRService
        ocr_service = OCRService(use_gpu=use_gpu, lang='en')
        logger.info(f"OCR Service initialized successfully (GPU: {use_gpu})")
    except Exception as e:
        logger.warning(f"Failed to initialize PaddleOCR: {e}")
        logger.info("Falling back to Mock OCR Service for testing...")
        try:
            from ocr_api.mock_ocr_service import MockOCRService
            ocr_service = MockOCRService(use_gpu=use_gpu, lang='en')
            logger.info("Mock OCR Service initialized successfully")
        except Exception as mock_error:
            # No usable service at all: abort application startup.
            logger.error(f"Failed to initialize Mock OCR Service: {mock_error}")
            raise

    yield  # application serves requests here

    # --- shutdown ---
    logger.info("Shutting down OCR Service...")
65
+
66
+
67
# Initialize FastAPI app with lifespan
app = FastAPI(
    title="Advanced OCR API",
    description="Production-ready API for OCR on scanned images using PaddleOCR",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# Configure CORS
# NOTE(review): browsers reject allow_credentials=True combined with a
# wildcard origin per the CORS spec — set CORS_ORIGINS in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=allowed_origins, # Configure via CORS_ORIGINS env var
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
85
+
86
+
87
+ @app.get("/")
88
+ async def root():
89
+ """Root endpoint with API information"""
90
+ return {
91
+ "message": "Advanced OCR API",
92
+ "version": "1.0.0",
93
+ "endpoints": {
94
+ "ocr": "/api/ocr",
95
+ "health": "/health",
96
+ "docs": "/docs"
97
+ }
98
+ }
99
+
100
+
101
+ @app.get("/health")
102
+ async def health_check():
103
+ """Health check endpoint"""
104
+ return {
105
+ "status": "healthy",
106
+ "ocr_service": "initialized" if ocr_service else "not_initialized",
107
+ "gpu_enabled": use_gpu
108
+ }
109
+
110
+
111
+ @app.post("/api/ocr")
112
+ async def perform_ocr(
113
+ file: UploadFile = File(..., description="Image file (jpg, png, tiff, pdf)")
114
+ ):
115
+ """
116
+ Perform OCR on uploaded image
117
+
118
+ Args:
119
+ file: Uploaded image file
120
+
121
+ Returns:
122
+ Structured JSON response with OCR results
123
+ """
124
+ if not ocr_service:
125
+ raise HTTPException(status_code=503, detail="OCR service not initialized")
126
+
127
+ # Validate file type
128
+ allowed_extensions = {'.jpg', '.jpeg', '.png', '.tiff', '.tif', '.pdf'}
129
+ file_ext = Path(file.filename).suffix.lower() if file.filename else ''
130
+
131
+ if file_ext not in allowed_extensions:
132
+ raise HTTPException(
133
+ status_code=400,
134
+ detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}"
135
+ )
136
+
137
+ # Create temporary file to store upload
138
+ temp_file = None
139
+ try:
140
+ # Save uploaded file to temporary location
141
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp:
142
+ content = await file.read()
143
+ temp.write(content)
144
+ temp_file = temp.name
145
+ logger.info(f"Processing uploaded file: {file.filename} ({len(content)} bytes)")
146
+
147
+ # Process image with OCR
148
+ result = ocr_service.process_image(temp_file)
149
+
150
+ logger.info(f"OCR processing completed for {file.filename}")
151
+ return JSONResponse(content=result)
152
+
153
+ except ValueError as e:
154
+ logger.error(f"Invalid image: {e}")
155
+ raise HTTPException(status_code=400, detail=str(e))
156
+ except Exception as e:
157
+ logger.error(f"OCR processing failed: {e}", exc_info=True)
158
+ raise HTTPException(status_code=500, detail=f"OCR processing failed: {str(e)}")
159
+ finally:
160
+ # Clean up temporary file
161
+ if temp_file and os.path.exists(temp_file):
162
+ try:
163
+ os.unlink(temp_file)
164
+ except Exception as e:
165
+ logger.warning(f"Failed to delete temporary file: {e}")
166
+
167
+
168
def main():
    """Start the uvicorn server, honoring HOST/PORT environment overrides."""
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", 8000))

    logger.info(f"Starting OCR API server on {host}:{port}")
    uvicorn.run(
        "ocr_api.main:app",
        host=host,
        port=port,
        reload=False,
        log_level="info",
    )


if __name__ == "__main__":
    main()
ocr_api/mock_ocr_service.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mock OCR Service for Testing
3
+ This is a simplified version for testing when PaddleOCR models cannot be downloaded
4
+ """
5
+
6
+ import logging
7
+ from typing import Dict, List, Any
8
+ import numpy as np
9
+ import cv2
10
+ from PIL import Image
11
+
12
# Module-level logger; basicConfig is a no-op if logging was already configured.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
14
+
15
+
16
class MockOCRService:
    """
    Drop-in stand-in for the real OCR service, used for testing.

    Emits a fixed, deterministic payload that mirrors the real service's
    schema (blocks -> lines -> words -> characters, each with pixel
    bounding boxes scaled to the input image) without running any model.
    """

    def __init__(self, use_gpu: bool = False, lang: str = 'en'):
        """Initialize Mock OCR Service"""
        # Stored only to mirror the real service's constructor signature.
        self.use_gpu = use_gpu
        self.lang = lang
        logger.info(f"Initializing Mock OCR Service (GPU: {use_gpu}, Language: {lang})")
        logger.warning("Using MOCK OCR SERVICE - not real OCR! For testing structure only.")

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image and return mock structured OCR results

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary containing mock structured OCR results

        Raises:
            ValueError: If the file cannot be decoded as an image.
        """
        # Only the dimensions are taken from the real image; text is canned.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Cannot read image from {image_path}")

        h, w = image.shape[:2]
        logger.info(f"Processing image: {w}x{h}")

        def quad(x0: float, y0: float, x1: float, y1: float) -> List[List[int]]:
            # Axis-aligned rectangle as four [x, y] corners, clockwise from top-left.
            return [
                [int(x0), int(y0)],
                [int(x1), int(y0)],
                [int(x1), int(y1)],
                [int(x0), int(y1)],
            ]

        # Per-character boxes for the word "Sample": each glyph gets an equal
        # 2.5%-of-width slice starting at 10% of the image width.
        sample_characters = [
            {
                "char": c,
                "bounding_box": quad(
                    w * (0.1 + i * 0.025), h * 0.05,
                    w * (0.1 + (i + 1) * 0.025), h * 0.15,
                ),
                "confidence": 0.95,
            }
            for i, c in enumerate("Sample")
        ]

        header_words = [
            {
                "word": "Sample",
                "bounding_box": quad(w * 0.1, h * 0.05, w * 0.25, h * 0.15),
                "confidence": 0.95,
                "characters": sample_characters,
            },
            {
                "word": "Document",
                "bounding_box": quad(w * 0.27, h * 0.05, w * 0.50, h * 0.15),
                "confidence": 0.93,
                "characters": [],
            },
            {
                "word": "Title",
                "bounding_box": quad(w * 0.52, h * 0.05, w * 0.68, h * 0.15),
                "confidence": 0.96,
                "characters": [],
            },
        ]

        header_block = {
            "block_id": "block_0",
            "block_type": "header",
            "bounding_box": quad(w * 0.1, h * 0.05, w * 0.9, h * 0.15),
            "lines": [
                {
                    "line_id": "line_0",
                    "text": "Sample Document Title (Mock OCR)",
                    "bounding_box": quad(w * 0.1, h * 0.05, w * 0.9, h * 0.15),
                    "font_size_estimate": int((h * 0.1) * 0.75),
                    "words": header_words,
                }
            ],
        }

        # Five stacked body lines, each 8% of the image height tall.
        paragraph_lines = [
            {
                "line_id": f"line_{i + 1}",
                "text": f"This is line {i + 1} of the mock paragraph content.",
                "bounding_box": quad(
                    w * 0.1, h * (0.2 + i * 0.08),
                    w * 0.9, h * (0.2 + (i + 1) * 0.08),
                ),
                "font_size_estimate": 12,
                "words": [],
            }
            for i in range(5)
        ]

        paragraph_block = {
            "block_id": "block_1",
            "block_type": "paragraph",
            "bounding_box": quad(w * 0.1, h * 0.2, w * 0.9, h * 0.6),
            "lines": paragraph_lines,
        }

        return {
            "image_width": w,
            "image_height": h,
            "blocks": [header_block, paragraph_block],
        }
ocr_api/ocr_service.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OCR Service Module
3
+ Handles all OCR operations using PaddleOCR
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import Dict, List, Any, Tuple, Optional
9
+ import numpy as np
10
+ from PIL import Image
11
+ from paddleocr import PaddleOCR
12
+ import cv2
13
+
14
# Module-level logger; basicConfig is a no-op if logging was already configured.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
+
18
class OCRService:
    """
    Service class for OCR operations using PaddleOCR.

    Supports text detection, recognition, layout parsing, and angle
    classification. Results are emitted as a nested structure of
    blocks -> lines -> words -> characters, each with pixel bounding boxes.
    PaddleOCR reports line-level geometry only, so word and character boxes
    are approximated by splitting each line box proportionally to text length.
    """

    # Configuration constants
    MIN_FONT_SIZE = 8  # Minimum font size in points
    MAX_FONT_SIZE = 72  # Maximum font size in points
    DEFAULT_HEADER_MAX_LENGTH = 50  # Max characters for header detection
    DEFAULT_VERTICAL_THRESHOLD_RATIO = 0.05  # Vertical grouping threshold as ratio of image height

    def __init__(self, use_gpu: bool = False, lang: str = 'en'):
        """
        Initialize OCR Service

        Args:
            use_gpu: Whether to use GPU for processing
            lang: Language for OCR (default: 'en')
        """
        self.use_gpu = use_gpu
        self.lang = lang

        # Initialize PaddleOCR with all features enabled
        logger.info(f"Initializing PaddleOCR (GPU: {use_gpu}, Language: {lang})")
        self.ocr_engine = PaddleOCR(
            use_angle_cls=True,  # Enable angle classification
            lang=lang,
            use_gpu=use_gpu,
            show_log=False,
            use_space_char=True
        )

        # Layout analysis is optional: PPStructure is not shipped in every
        # paddleocr build, so degrade gracefully to proximity grouping.
        try:
            from paddleocr import PPStructure
            self.structure_engine = PPStructure(
                use_gpu=use_gpu,
                lang=lang,
                show_log=False,
                layout=True,   # Enable layout analysis
                table=False,   # We'll handle tables separately if needed
                ocr=False      # We'll use our own OCR
            )
        except ImportError:
            logger.warning("PPStructure not available, layout parsing will be limited")
            self.structure_engine = None

    def process_image(self, image_path: str) -> Dict[str, Any]:
        """
        Process an image and return structured OCR results

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary containing structured OCR results

        Raises:
            ValueError: If the file cannot be decoded as an image.
        """
        # Load image (also validates it is a decodable raster file)
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Cannot read image from {image_path}")

        height, width = image.shape[:2]
        logger.info(f"Processing image: {width}x{height}")

        # Perform OCR (cls=True applies the angle classifier to each crop)
        ocr_result = self.ocr_engine.ocr(image_path, cls=True)

        # Perform layout analysis if available; failures fall back to
        # proximity-based grouping rather than aborting the request.
        layout_result = None
        if self.structure_engine:
            try:
                layout_result = self.structure_engine(image_path)
            except Exception as e:
                logger.warning(f"Layout analysis failed: {e}")

        return self._build_structured_response(
            ocr_result,
            layout_result,
            width,
            height
        )

    def _build_structured_response(
        self,
        ocr_result: List,
        layout_result: Optional[List],
        width: int,
        height: int
    ) -> Dict[str, Any]:
        """
        Build structured JSON response from OCR results

        Args:
            ocr_result: Raw OCR result from PaddleOCR
            layout_result: Layout analysis result
            width: Image width
            height: Image height

        Returns:
            Structured dictionary matching required schema
        """
        blocks = []

        # Extract layout blocks if available
        layout_blocks = self._extract_layout_blocks(layout_result) if layout_result else []

        # PaddleOCR wraps per-page results; page 0 holds the line list.
        if ocr_result and ocr_result[0]:
            if layout_blocks:
                blocks = self._group_lines_by_layout(ocr_result[0], layout_blocks)
            else:
                blocks = self._group_lines_by_proximity(ocr_result[0])

        return {
            "image_width": width,
            "image_height": height,
            "blocks": blocks
        }

    def _extract_layout_blocks(self, layout_result: List) -> List[Dict]:
        """Extract (type, bbox) pairs from the structure parser result."""
        blocks = []
        for item in layout_result:
            if isinstance(item, dict) and 'type' in item:
                blocks.append({
                    'type': item.get('type', 'paragraph'),
                    'bbox': item.get('bbox', [0, 0, 0, 0])  # [x1, y1, x2, y2]
                })
        return blocks

    def _group_lines_by_layout(
        self,
        ocr_lines: List,
        layout_blocks: List[Dict]
    ) -> List[Dict]:
        """
        Group OCR lines into the blocks found by layout analysis.

        A line belongs to a layout block when its center falls inside the
        block's bbox; lines matching no block are collected into a trailing
        catch-all paragraph block.
        """
        blocks = []

        # If no layout blocks, fall back to proximity grouping
        if not layout_blocks:
            return self._group_lines_by_proximity(ocr_lines)

        # Track ORIGINAL line indices that were assigned to some block.
        # (Comparing per-block line_ids against global indices would drop or
        # duplicate lines whenever the two numbering schemes diverge.)
        assigned_indices = set()

        for idx, layout_block in enumerate(layout_blocks):
            block_type = layout_block.get('type', 'paragraph')
            layout_bbox = layout_block.get('bbox', [0, 0, 0, 0])

            # Find lines whose center lies within this layout block
            member_indices = []
            for line_idx, line_data in enumerate(ocr_lines):
                line_center = self._get_bbox_center(line_data[0])
                if self._point_in_bbox(line_center, layout_bbox):
                    member_indices.append(line_idx)

            if member_indices:
                assigned_indices.update(member_indices)
                blocks.append(self._create_block(
                    block_id=f"block_{idx}",
                    block_type=block_type,
                    lines=[ocr_lines[i] for i in member_indices],
                    line_indices=member_indices
                ))

        # Collect lines not assigned to any layout block
        leftover = [i for i in range(len(ocr_lines)) if i not in assigned_indices]
        if leftover:
            blocks.append(self._create_block(
                block_id=f"block_{len(blocks)}",
                block_type="paragraph",
                lines=[ocr_lines[i] for i in leftover],
                line_indices=leftover
            ))

        return blocks

    def _group_lines_by_proximity(self, ocr_lines: List) -> List[Dict]:
        """
        Group OCR lines into blocks based on spatial proximity.

        Simple heuristic: consecutive lines (in top-to-bottom order) whose
        vertical centers are within a fixed pixel threshold belong to the
        same block.
        """
        if not ocr_lines:
            return []

        # Fixed threshold that works for most scanned documents; an adaptive
        # value (DEFAULT_VERTICAL_THRESHOLD_RATIO * image height) could be
        # used when the image height is threaded through here.
        threshold = 50  # Vertical distance threshold in pixels for grouping

        # Sort lines by vertical position (top to bottom), keeping the
        # original index for stable line_ids.
        sorted_lines = sorted(
            enumerate(ocr_lines),
            key=lambda x: self._get_bbox_center(x[1][0])[1]
        )

        # Accumulator state (previously uninitialized -> NameError on use)
        blocks: List[Dict] = []
        current_block_lines: List = []
        last_y: Optional[float] = None  # center-y of the previous line

        for orig_idx, line_data in sorted_lines:
            bbox = line_data[0]
            center_y = self._get_bbox_center(bbox)[1]

            if last_y is None or abs(center_y - last_y) < threshold:
                current_block_lines.append((orig_idx, line_data))
            else:
                # Vertical gap exceeded: flush current block, start a new one
                if current_block_lines:
                    blocks.append(self._create_block(
                        block_id=f"block_{len(blocks)}",
                        block_type=self._infer_block_type(current_block_lines),
                        lines=[line[1] for line in current_block_lines],
                        line_indices=[line[0] for line in current_block_lines]
                    ))
                current_block_lines = [(orig_idx, line_data)]

            last_y = center_y

        # Flush the trailing block
        if current_block_lines:
            blocks.append(self._create_block(
                block_id=f"block_{len(blocks)}",
                block_type=self._infer_block_type(current_block_lines),
                lines=[line[1] for line in current_block_lines],
                line_indices=[line[0] for line in current_block_lines]
            ))

        return blocks

    def _infer_block_type(self, lines: List) -> str:
        """
        Infer block type based on content heuristics.

        A single short line that does not end with a period is treated as a
        header; everything else defaults to paragraph.

        Args:
            lines: List of (orig_index, line_data) tuples.
        """
        if not lines:
            return "paragraph"

        # Get first line text
        first_line = lines[0][1]
        text = first_line[1][0] if len(first_line) > 1 else ""

        if len(lines) == 1:
            if len(text) < self.DEFAULT_HEADER_MAX_LENGTH and not text.endswith('.'):
                return "header"

        return "paragraph"

    def _create_block(
        self,
        block_id: str,
        block_type: str,
        lines: List,
        line_indices: Optional[List[int]] = None
    ) -> Dict:
        """
        Create a block structure from OCR lines.

        Args:
            block_id: Identifier for the block (e.g. "block_0").
            block_type: Inferred or layout-provided type ("header", ...).
            lines: Raw PaddleOCR line entries: [bbox, (text, confidence)].
            line_indices: Original indices used for stable line_ids; defaults
                to positional indices when not supplied.
        """
        if line_indices is None:
            line_indices = list(range(len(lines)))

        block_lines = []
        all_points = []

        for idx, line_data in zip(line_indices, lines):
            bbox = line_data[0]
            text_tuple = line_data[1]
            # PaddleOCR emits (text, confidence); tolerate bare strings too.
            text = text_tuple[0] if isinstance(text_tuple, tuple) else text_tuple
            confidence = text_tuple[1] if isinstance(text_tuple, tuple) and len(text_tuple) > 1 else 0.95

            line_bbox = self._normalize_bbox(bbox)
            all_points.extend(line_bbox)

            # Estimate font size from bbox height
            font_size = self._estimate_font_size(line_bbox)

            # Approximate per-word geometry within the line
            words = self._extract_words_from_line(text, line_bbox, confidence)

            block_lines.append({
                "line_id": f"line_{idx}",
                "text": text,
                "bounding_box": line_bbox,
                "font_size_estimate": font_size,
                "words": words
            })

        # Block bbox is the minimum rectangle enclosing all line corners
        block_bbox = self._calculate_enclosing_bbox(all_points)

        return {
            "block_id": block_id,
            "block_type": block_type,
            "bounding_box": block_bbox,
            "lines": block_lines
        }

    def _extract_words_from_line(
        self,
        text: str,
        line_bbox: List[List[int]],
        line_confidence: float
    ) -> List[Dict]:
        """
        Extract words from a line and approximate their bounding boxes by
        splitting the line box proportionally to character position.
        """
        words = text.split()
        if not words:
            return []

        # Line extents
        x_coords = [p[0] for p in line_bbox]
        y_coords = [p[1] for p in line_bbox]
        line_width = max(x_coords) - min(x_coords)
        line_height = max(y_coords) - min(y_coords)
        line_x_start = min(x_coords)
        line_y_min = min(y_coords)

        # Proportions are computed over the full text, spaces included
        total_chars = len(text)

        word_list = []
        char_position = 0

        for word in words:
            word_start_ratio = char_position / total_chars if total_chars > 0 else 0
            word_end_ratio = (char_position + len(word)) / total_chars if total_chars > 0 else 0

            word_x_start = line_x_start + int(line_width * word_start_ratio)
            word_x_end = line_x_start + int(line_width * word_end_ratio)

            # Simplified axis-aligned rectangle for the word
            word_bbox = [
                [word_x_start, line_y_min],
                [word_x_end, line_y_min],
                [word_x_end, line_y_min + line_height],
                [word_x_start, line_y_min + line_height]
            ]

            characters = self._extract_characters_from_word(
                word,
                word_bbox,
                line_confidence
            )

            word_list.append({
                "word": word,
                "bounding_box": word_bbox,
                # PaddleOCR gives line-level confidence only; reuse it per word
                "confidence": line_confidence,
                "characters": characters
            })

            # Advance past the word and its following space
            char_position += len(word) + 1

        return word_list

    def _extract_characters_from_word(
        self,
        word: str,
        word_bbox: List[List[int]],
        confidence: float
    ) -> List[Dict]:
        """
        Approximate per-character boxes by splitting the word box into
        equal-width slices.
        """
        if not word:
            return []

        x_coords = [p[0] for p in word_bbox]
        y_coords = [p[1] for p in word_bbox]
        word_width = max(x_coords) - min(x_coords)
        word_height = max(y_coords) - min(y_coords)
        word_x_start = min(x_coords)
        word_y_min = min(y_coords)

        char_list = []
        num_chars = len(word)

        for i, char in enumerate(word):
            char_start_ratio = i / num_chars
            char_end_ratio = (i + 1) / num_chars

            char_x_start = word_x_start + int(word_width * char_start_ratio)
            char_x_end = word_x_start + int(word_width * char_end_ratio)

            char_bbox = [
                [char_x_start, word_y_min],
                [char_x_end, word_y_min],
                [char_x_end, word_y_min + word_height],
                [char_x_start, word_y_min + word_height]
            ]

            char_list.append({
                "char": char,
                "bounding_box": char_bbox,
                "confidence": confidence
            })

        return char_list

    def _normalize_bbox(self, bbox: List) -> List[List[int]]:
        """Normalize a bbox to four [x, y] integer corners.

        Accepts either a list of points or a flat [x1, y1, x2, y2] rect.
        """
        if isinstance(bbox[0], (list, tuple)) and len(bbox[0]) == 2:
            # Already a polygon of points
            return [[int(p[0]), int(p[1])] for p in bbox]
        else:
            # Flat rect -> clockwise corners from top-left
            return [[int(bbox[0]), int(bbox[1])],
                    [int(bbox[2]), int(bbox[1])],
                    [int(bbox[2]), int(bbox[3])],
                    [int(bbox[0]), int(bbox[3])]]

    def _estimate_font_size(self, bbox: List[List[int]]) -> int:
        """
        Estimate font size based on bounding box height.

        Simple heuristic: pixel height * 0.75 approximates point size
        (1 point ≈ 1.333 pixels at 96 DPI), clamped to sane bounds.
        """
        y_coords = [p[1] for p in bbox]
        height = max(y_coords) - min(y_coords)
        font_size = int(height * 0.75)
        return max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size))

    def _calculate_enclosing_bbox(self, points: List[List[int]]) -> List[List[int]]:
        """Calculate the minimum axis-aligned box enclosing all points."""
        if not points:
            return [[0, 0], [0, 0], [0, 0], [0, 0]]

        x_coords = [p[0] for p in points]
        y_coords = [p[1] for p in points]

        min_x, max_x = min(x_coords), max(x_coords)
        min_y, max_y = min(y_coords), max(y_coords)

        return [
            [min_x, min_y],
            [max_x, min_y],
            [max_x, max_y],
            [min_x, max_y]
        ]

    def _get_bbox_center(self, bbox: List) -> Tuple[float, float]:
        """Get the centroid of a bbox (polygon or flat [x1, y1, x2, y2])."""
        if isinstance(bbox[0], (list, tuple)):
            x_coords = [p[0] for p in bbox]
            y_coords = [p[1] for p in bbox]
        else:
            x_coords = [bbox[0], bbox[2]]
            y_coords = [bbox[1], bbox[3]]

        return (sum(x_coords) / len(x_coords), sum(y_coords) / len(y_coords))

    def _point_in_bbox(self, point: Tuple[float, float], bbox: List) -> bool:
        """Check if a point is inside a flat [x1, y1, x2, y2] bbox.

        Polygon-form bboxes are not supported and yield False.
        """
        x, y = point
        if len(bbox) == 4 and not isinstance(bbox[0], (list, tuple)):
            return bbox[0] <= x <= bbox[2] and bbox[1] <= y <= bbox[3]
        return False