nagpalsumit247 committed on
Commit
f1880d7
·
verified ·
1 Parent(s): 6f1ce2d

Upload 7 files

Browse files
Files changed (7) hide show
  1. __init__.py +0 -0
  2. main.py +188 -0
  3. models.py +106 -0
  4. pipeline/__init__.py +0 -0
  5. pipeline/font_id.py +134 -0
  6. pipeline/ocr.py +123 -0
  7. pipeline/typography.py +135 -0
__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application – image analysis endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import tempfile
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
11
+ from PIL import Image
12
+
13
+ from app.models import (
14
+ AnalysisResponse,
15
+ FontAlternative,
16
+ FontInfo,
17
+ FontSources,
18
+ ImageMetadata,
19
+ Reconstruction,
20
+ TextBlock,
21
+ )
22
+ from app.pipeline.font_id import identify_font
23
+ from app.pipeline.ocr import run_ocr
24
+ from app.pipeline.typography import (
25
+ estimate_font_metrics,
26
+ extract_characters,
27
+ extract_geometry,
28
+ extract_rendering,
29
+ )
30
+
31
logger = logging.getLogger(__name__)

# Image file extensions accepted by the /analyze/image endpoint.
ALLOWED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp"}

# Application instance; routes are registered via decorators below.
app = FastAPI(
    title="Image Analysis API",
    description="Analyzes images and returns JSON for near-pixel-perfect reconstruction.",
    version="1.0.0",
)
40
+
41
+
42
@app.get("/")
async def root():
    """Health-check endpoint confirming the service is running."""
    payload = {
        "status": "ok",
        "message": "Image Analysis API is running.",
    }
    return payload
45
+
46
+
47
@app.post("/analyze/image", response_model=AnalysisResponse)
async def analyze_image(
    image: UploadFile = File(...),
    dpi: Optional[int] = Form(None),
    language_hint: Optional[str] = Form(None),
    output_units: Optional[str] = Form("px"),
    preserve_whitespace: Optional[bool] = Form(True),
):
    """Analyze an input image and return structured JSON for reconstruction.

    Pipeline:
        1. OCR text detection & recognition
        2. Font identification on OCR-detected regions
        3. Typography & geometry extraction

    Raises
    ------
    HTTPException
        400 for unsupported formats, 503 when the OCR service is
        unreachable, 500 for unexpected processing failures.
    """
    analysis_warnings: list[str] = []

    # --- Validate file extension ---
    filename = image.filename or ""
    ext = Path(filename).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported image format '{ext}'. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
        )

    # --- Save upload to a temp file (pipeline stages operate on paths) ---
    contents = await image.read()
    tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
    tmp.write(contents)
    tmp.close()
    tmp_path = tmp.name

    try:
        # Use a context manager so the PIL file handle is closed
        # deterministically instead of being leaked.
        with Image.open(tmp_path) as img:
            img_width, img_height = img.size
            color_mode = img.mode  # RGB, RGBA, L, etc.
            if color_mode == "L":
                color_mode = "GRAY"

            detected_dpi = dpi
            if detected_dpi is None:
                dpi_value = img.info.get("dpi")
                if dpi_value:
                    # PIL usually reports dpi as an (x, y) pair; some formats
                    # yield floats, so normalize robustly.
                    first = (
                        dpi_value[0]
                        if isinstance(dpi_value, (tuple, list))
                        else dpi_value
                    )
                    detected_dpi = int(round(float(first)))
                else:
                    detected_dpi = 72  # conventional default when metadata is absent

            image_meta = ImageMetadata(
                width=img_width,
                height=img_height,
                dpi=detected_dpi,
                color_mode=color_mode,
            )

            # --- Step 1: OCR ---
            try:
                ocr_blocks = run_ocr(tmp_path, language_hint=language_hint)
            except RuntimeError as exc:
                # Chain the cause so logs show why OCR was unavailable.
                raise HTTPException(
                    status_code=503, detail="OCR service unavailable"
                ) from exc

            if not ocr_blocks:
                analysis_warnings.append("OCR returned no text blocks")
                return AnalysisResponse(
                    image_metadata=image_meta,
                    blocks=[],
                    warnings=analysis_warnings,
                )

            # --- Steps 2 & 3: Font ID + Typography ---
            blocks: list[TextBlock] = []
            for idx, ocr_block in enumerate(ocr_blocks):
                block_id = f"block_{idx + 1:03d}"

                # Geometry
                geometry = extract_geometry(ocr_block, img_width, img_height)

                # Font identification on the cropped region
                font_result = identify_font(img, ocr_block.box)

                # Typography / rendering
                rendering, font_size_px = extract_rendering(ocr_block, img)

                # Font metrics
                metrics = estimate_font_metrics(font_size_px)

                font_info = FontInfo(
                    primary=font_result.primary,
                    confidence=font_result.confidence,
                    alternatives=[
                        FontAlternative(name=a.name, confidence=a.confidence)
                        for a in font_result.alternatives
                    ],
                    category=font_result.category,
                    metrics=metrics,
                )

                if font_result.uncertain:
                    analysis_warnings.append(
                        f"Font identification uncertain for {block_id}"
                    )

                # Per-character boxes
                characters = extract_characters(ocr_block, geometry, font_size_px)

                # Optionally collapse runs of whitespace in the block text.
                if not preserve_whitespace:
                    text = " ".join(ocr_block.text.split())
                else:
                    text = ocr_block.text

                blocks.append(
                    TextBlock(
                        id=block_id,
                        text=text,
                        language=ocr_block.language,
                        confidence=ocr_block.confidence,
                        reading_order=ocr_block.reading_order,
                        geometry=geometry,
                        font=font_info,
                        rendering=rendering,
                        characters=characters,
                    )
                )

            return AnalysisResponse(
                image_metadata=image_meta,
                blocks=blocks,
                font_sources=FontSources(
                    strategy="fallback",
                    notes="Embed font when possible to ensure rendering parity",
                ),
                reconstruction=Reconstruction(),
                warnings=analysis_warnings,
            )

    except HTTPException:
        raise
    except Exception as exc:
        logger.exception("Unexpected error during analysis")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        Path(tmp_path).unlink(missing_ok=True)
models.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic models for the image analysis API request and response."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Optional
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Response models
12
+ # ---------------------------------------------------------------------------
13
+
14
+
15
+ class BoundingBox(BaseModel):
16
+ x: float
17
+ y: float
18
+ width: float
19
+ height: float
20
+
21
+
22
+ class Geometry(BaseModel):
23
+ bounding_box: BoundingBox
24
+ baseline: List[float] = Field(
25
+ ..., description="[x1, y1, x2, y2] baseline coordinates"
26
+ )
27
+ rotation: float = 0.0
28
+ alignment: str = "left"
29
+
30
+
31
+ class FontAlternative(BaseModel):
32
+ name: str
33
+ confidence: float
34
+
35
+
36
+ class FontMetrics(BaseModel):
37
+ ascender_px: float
38
+ descender_px: float
39
+ cap_height_px: float
40
+ x_height_px: float
41
+ units_per_em: int = 1000
42
+ scale_factor: float = 1.0
43
+
44
+
45
+ class FontInfo(BaseModel):
46
+ primary: str
47
+ confidence: float
48
+ alternatives: List[FontAlternative] = []
49
+ category: Optional[str] = None
50
+ metrics: FontMetrics
51
+
52
+
53
+ class Rendering(BaseModel):
54
+ font_size_px: float
55
+ line_height_px: float
56
+ letter_spacing_px: float
57
+ word_spacing_px: float
58
+ fill_color: str = "#000000"
59
+ antialiasing: str = "grayscale"
60
+ hinting: str = "none"
61
+
62
+
63
+ class CharacterInfo(BaseModel):
64
+ char: str
65
+ box: List[float] = Field(
66
+ ..., description="[x1, y1, x2, y2] bounding box"
67
+ )
68
+ advance_width: float
69
+ baseline_offset: float = 0.0
70
+
71
+
72
+ class TextBlock(BaseModel):
73
+ id: str
74
+ text: str
75
+ language: str = "en"
76
+ confidence: float = 0.0
77
+ reading_order: int = 0
78
+ geometry: Geometry
79
+ font: FontInfo
80
+ rendering: Rendering
81
+ characters: List[CharacterInfo] = []
82
+
83
+
84
+ class ImageMetadata(BaseModel):
85
+ width: int
86
+ height: int
87
+ dpi: int = 72
88
+ color_mode: str = "RGB"
89
+
90
+
91
+ class FontSources(BaseModel):
92
+ strategy: str = "fallback"
93
+ notes: str = "Embed font when possible to ensure rendering parity"
94
+
95
+
96
+ class Reconstruction(BaseModel):
97
+ guarantee: str = "near-pixel-perfect"
98
+ supported_renderers: List[str] = ["canvas", "svg", "pdf", "html"]
99
+
100
+
101
+ class AnalysisResponse(BaseModel):
102
+ image_metadata: ImageMetadata
103
+ blocks: List[TextBlock] = []
104
+ font_sources: FontSources = FontSources()
105
+ reconstruction: Reconstruction = Reconstruction()
106
+ warnings: List[str] = []
pipeline/__init__.py ADDED
File without changes
pipeline/font_id.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Font identification using Hugging Face font-identifier model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Optional
9
+
10
+ from PIL import Image
11
+
12
logger = logging.getLogger(__name__)

# Hugging Face model/space used for font classification.
HF_FONT_MODEL = "gaborcselle/font-identifier"
15
+
16
+ FONT_CATEGORIES = {
17
+ "arial": "sans",
18
+ "helvetica": "sans",
19
+ "verdana": "sans",
20
+ "tahoma": "sans",
21
+ "calibri": "sans",
22
+ "roboto": "sans",
23
+ "open sans": "sans",
24
+ "times": "serif",
25
+ "times new roman": "serif",
26
+ "georgia": "serif",
27
+ "garamond": "serif",
28
+ "palatino": "serif",
29
+ "courier": "mono",
30
+ "courier new": "mono",
31
+ "consolas": "mono",
32
+ "monaco": "mono",
33
+ "comic sans": "display",
34
+ "impact": "display",
35
+ "papyrus": "handwritten",
36
+ }
37
+
38
+
39
+ @dataclass
40
+ class FontCandidate:
41
+ name: str
42
+ confidence: float
43
+
44
+
45
+ @dataclass
46
+ class FontResult:
47
+ primary: str = "unknown"
48
+ confidence: float = 0.0
49
+ alternatives: List[FontCandidate] = field(default_factory=list)
50
+ category: Optional[str] = None
51
+ uncertain: bool = False
52
+
53
+
54
+ def _categorize(font_name: str) -> Optional[str]:
55
+ lower = font_name.lower()
56
+ for key, cat in FONT_CATEGORIES.items():
57
+ if key in lower:
58
+ return cat
59
+ return None
60
+
61
+
62
def identify_font(image: Image.Image, box: List[float]) -> FontResult:
    """Crop the image to *box* and identify the font via the HF model.

    Parameters
    ----------
    image : PIL.Image.Image
        Full original image.
    box : list[float]
        [x1, y1, x2, y2] bounding box of the text region.

    Returns
    -------
    FontResult
        Identified font with confidence and alternatives. ``primary`` is
        "unknown" (with ``uncertain=True``) when the region is degenerate
        or the remote model cannot be reached — this function never raises.
    """
    x1, y1, x2, y2 = box
    crop = image.crop((int(x1), int(y1), int(x2), int(y2)))

    # Degenerate crops carry no glyph information worth classifying.
    if crop.width < 2 or crop.height < 2:
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    try:
        import os
        import tempfile

        from gradio_client import Client, handle_file

        # The gradio client wants a file path, so persist the crop briefly.
        # (Saving directly avoids the previous redundant BytesIO round-trip.)
        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        try:
            crop.save(tmp, format="PNG")
            tmp.close()
            client = Client(HF_FONT_MODEL)
            result = client.predict(handle_file(tmp.name), api_name="/predict")
        finally:
            tmp.close()  # no-op if already closed
            os.unlink(tmp.name)

        if isinstance(result, dict) and "label" in result:
            label = result["label"]
            # Guard against a present-but-empty "confidences" list, which
            # previously raised IndexError and discarded the label.
            confidences = result.get("confidences") or []
            conf = (
                float(confidences[0].get("confidence", 0.0)) if confidences else 0.0
            )
            alternatives = [
                FontCandidate(
                    name=alt.get("label", "unknown"),
                    confidence=float(alt.get("confidence", 0.0)),
                )
                for alt in confidences[1:4]
            ]
            return FontResult(
                primary=label,
                confidence=conf,
                alternatives=alternatives,
                category=_categorize(label),
                uncertain=conf < 0.5,  # flag low-confidence predictions
            )

        if isinstance(result, str):
            # Some space versions return a bare label string with no scores.
            name = result.strip()
            return FontResult(
                primary=name,
                confidence=0.5,
                category=_categorize(name),
                uncertain=True,
            )

        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    except Exception as exc:
        # Best-effort: font identification is advisory, so degrade gracefully
        # instead of failing the whole analysis request.
        logger.warning("Font identification failed: %s", exc)
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)
pipeline/ocr.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OCR pipeline using Hugging Face Image-to-Multilingual-OCR space."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+
10
+ from gradio_client import Client, handle_file
11
+
12
logger = logging.getLogger(__name__)

# Hugging Face Space used for text detection/recognition.
HF_OCR_SPACE = "awacke1/Image-to-Multilingual-OCR"
15
+
16
+
17
+ @dataclass
18
+ class OCRWord:
19
+ text: str
20
+ box: List[float] # [x1, y1, x2, y2]
21
+ confidence: float = 0.0
22
+ language: str = "en"
23
+
24
+
25
+ @dataclass
26
+ class OCRBlock:
27
+ text: str
28
+ words: List[OCRWord] = field(default_factory=list)
29
+ box: List[float] = field(default_factory=lambda: [0, 0, 0, 0])
30
+ confidence: float = 0.0
31
+ language: str = "en"
32
+ reading_order: int = 0
33
+
34
+
35
+ def _parse_ocr_response(raw_result: str, img_width: int, img_height: int) -> List[OCRBlock]:
36
+ """Parse the raw text output from the OCR space into structured blocks.
37
+
38
+ The OCR space returns detected text. We parse lines and synthesise
39
+ bounding boxes spread evenly across the image when per-word coordinates
40
+ are not directly available from the API.
41
+ """
42
+ if not raw_result or not raw_result.strip():
43
+ return []
44
+
45
+ lines = [l for l in raw_result.strip().splitlines() if l.strip()]
46
+ blocks: List[OCRBlock] = []
47
+
48
+ line_height = img_height / max(len(lines), 1)
49
+
50
+ for idx, line in enumerate(lines):
51
+ y1 = idx * line_height
52
+ y2 = y1 + line_height
53
+ x1 = 0.0
54
+ x2 = float(img_width)
55
+
56
+ words_in_line = line.split()
57
+ word_width = (x2 - x1) / max(len(words_in_line), 1)
58
+
59
+ ocr_words: List[OCRWord] = []
60
+ for w_idx, word in enumerate(words_in_line):
61
+ wx1 = x1 + w_idx * word_width
62
+ wx2 = wx1 + word_width
63
+ ocr_words.append(
64
+ OCRWord(
65
+ text=word,
66
+ box=[wx1, y1, wx2, y2],
67
+ confidence=0.90,
68
+ )
69
+ )
70
+
71
+ blocks.append(
72
+ OCRBlock(
73
+ text=line,
74
+ words=ocr_words,
75
+ box=[x1, y1, x2, y2],
76
+ confidence=0.90,
77
+ reading_order=idx,
78
+ )
79
+ )
80
+
81
+ return blocks
82
+
83
+
84
def run_ocr(image_path: str, language_hint: Optional[str] = None) -> List[OCRBlock]:
    """Send an image to the HF OCR space and return structured blocks.

    Parameters
    ----------
    image_path : str
        Path to the image file on disk.
    language_hint : str | None
        Comma-separated language codes (unused by the space but kept for
        API compatibility).

    Returns
    -------
    list[OCRBlock]
        Parsed OCR blocks with word-level data.

    Raises
    ------
    RuntimeError
        When the OCR service is completely unreachable (HTTP 503 equivalent).
    """
    from PIL import Image

    # Only the dimensions are needed here; use a context manager so the
    # file handle is closed instead of leaked (Image.open is lazy and
    # keeps the file open otherwise).
    with Image.open(image_path) as img:
        img_width, img_height = img.size

    try:
        client = Client(HF_OCR_SPACE)
        result = client.predict(
            handle_file(image_path),
            api_name="/predict",
        )
    except Exception as exc:
        logger.error("OCR space call failed: %s", exc)
        raise RuntimeError(f"OCR service unavailable: {exc}") from exc

    raw_text = str(result) if result else ""
    blocks = _parse_ocr_response(raw_text, img_width, img_height)
    if not blocks:
        logger.warning("OCR returned no text for %s", image_path)
    return blocks
pipeline/typography.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typography and geometry extraction from OCR results and image data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Tuple
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
+ from app.models import (
11
+ BoundingBox,
12
+ CharacterInfo,
13
+ FontMetrics,
14
+ Geometry,
15
+ Rendering,
16
+ )
17
+ from app.pipeline.ocr import OCRBlock
18
+
19
+
20
def _dominant_color(image: Image.Image, box: List[float]) -> str:
    """Return the dominant (most common) color in the region as a hex string."""
    left, top, right, bottom = (int(v) for v in box)
    left, top = max(left, 0), max(top, 0)
    right = min(right, image.width)
    bottom = min(bottom, image.height)
    # Empty or inverted regions fall back to black.
    if right <= left or bottom <= top:
        return "#000000"

    region = image.crop((left, top, right, bottom)).convert("RGB")
    pixels = np.array(region).reshape(-1, 3)
    # Text is usually dark: average the dark pixels when any exist,
    # otherwise average everything.
    is_dark = pixels.sum(axis=1) < 384  # rough brightness threshold
    sample = pixels[is_dark] if is_dark.any() else pixels
    r, g, b = sample.mean(axis=0).astype(int)
    return "#{:02x}{:02x}{:02x}".format(r, g, b)
39
+
40
+
41
def extract_geometry(block: OCRBlock, img_width: int, img_height: int) -> Geometry:
    """Build a Geometry model from an OCR block's bounding box.

    The baseline is approximated at 85% of the box height, a typical
    baseline position for Latin text.
    """
    left, top, right, bottom = block.box
    box_w = right - left
    box_h = bottom - top
    baseline_y = top + box_h * 0.85

    return Geometry(
        bounding_box=BoundingBox(x=left, y=top, width=box_w, height=box_h),
        baseline=[left, baseline_y, right, baseline_y],
        rotation=0.0,
        alignment="left",
    )
53
+
54
+
55
def estimate_font_metrics(font_size_px: float) -> FontMetrics:
    """Estimate standard font metrics from the font size.

    The ratios are heuristics typical of Latin typefaces: ascender ~0.8em,
    descender ~0.2em below the baseline (hence negative), cap height
    ~0.7em, x-height ~0.48em.
    """
    em = font_size_px
    return FontMetrics(
        ascender_px=round(em * 0.8, 2),
        descender_px=round(-em * 0.2, 2),
        cap_height_px=round(em * 0.7, 2),
        x_height_px=round(em * 0.48, 2),
        units_per_em=1000,
        scale_factor=1.0,
    )
65
+
66
+
67
def extract_rendering(
    block: OCRBlock, image: Image.Image
) -> Tuple[Rendering, float]:
    """Compute rendering attributes; returns (Rendering, font_size_px)."""
    x1, y1, x2, y2 = block.box
    box_height = y2 - y1
    box_width = x2 - x1

    # Glyphs typically occupy ~75% of the line box height.
    font_size_px = round(box_height * 0.75, 2) if box_height > 0 else 12.0
    line_height_px = round(box_height, 2)

    content = block.text
    char_count = max(len(content), 1)

    # Assume an average glyph advance of 0.6em; the surplus per character
    # is attributed to letter spacing, clamped at zero.
    letter_spacing = round((box_width / char_count) - font_size_px * 0.6, 2)
    if letter_spacing < 0:
        letter_spacing = 0.0

    tokens = content.split()
    gap_count = max(len(tokens) - 1, 1)
    estimated_text_width = char_count * font_size_px * 0.6
    word_spacing = round((box_width - estimated_text_width) / gap_count, 2)
    if word_spacing < 0:
        # Fall back to a conventional quarter-em space.
        word_spacing = round(font_size_px * 0.25, 2)

    rendering = Rendering(
        font_size_px=font_size_px,
        line_height_px=line_height_px,
        letter_spacing_px=letter_spacing,
        word_spacing_px=word_spacing,
        fill_color=_dominant_color(image, block.box),
        antialiasing="grayscale",
        hinting="none",
    )
    return rendering, font_size_px
103
+
104
+
105
def extract_characters(
    block: OCRBlock, geometry: Geometry, font_size_px: float
) -> List[CharacterInfo]:
    """Generate per-character bounding boxes spread across the block."""
    content = block.text
    if not content:
        return []

    bbox = geometry.bounding_box
    # Every character gets an equal horizontal slice of the block.
    advance = bbox.width / max(len(content), 1)
    top = round(bbox.y, 2)
    bottom = round(bbox.y + bbox.height, 2)

    result: List[CharacterInfo] = []
    for pos, symbol in enumerate(content):
        left = round(bbox.x + pos * advance, 2)
        right = round(left + advance, 2)
        result.append(
            CharacterInfo(
                char=symbol,
                box=[left, top, right, bottom],
                advance_width=round(advance, 2),
                baseline_offset=0.0,
            )
        )
    return result