Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- __init__.py +0 -0
- main.py +188 -0
- models.py +106 -0
- pipeline/__init__.py +0 -0
- pipeline/font_id.py +134 -0
- pipeline/ocr.py +123 -0
- pipeline/typography.py +135 -0
__init__.py
ADDED
|
File without changes
|
main.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application – image analysis endpoint."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
from app.models import (
|
| 14 |
+
AnalysisResponse,
|
| 15 |
+
FontAlternative,
|
| 16 |
+
FontInfo,
|
| 17 |
+
FontSources,
|
| 18 |
+
ImageMetadata,
|
| 19 |
+
Reconstruction,
|
| 20 |
+
TextBlock,
|
| 21 |
+
)
|
| 22 |
+
from app.pipeline.font_id import identify_font
|
| 23 |
+
from app.pipeline.ocr import run_ocr
|
| 24 |
+
from app.pipeline.typography import (
|
| 25 |
+
estimate_font_metrics,
|
| 26 |
+
extract_characters,
|
| 27 |
+
extract_geometry,
|
| 28 |
+
extract_rendering,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
ALLOWED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp"}
|
| 34 |
+
|
| 35 |
+
app = FastAPI(
|
| 36 |
+
title="Image Analysis API",
|
| 37 |
+
description="Analyzes images and returns JSON for near-pixel-perfect reconstruction.",
|
| 38 |
+
version="1.0.0",
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@app.get("/")
|
| 43 |
+
async def root():
|
| 44 |
+
return {"status": "ok", "message": "Image Analysis API is running."}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@app.post("/analyze/image", response_model=AnalysisResponse)
|
| 48 |
+
async def analyze_image(
|
| 49 |
+
image: UploadFile = File(...),
|
| 50 |
+
dpi: Optional[int] = Form(None),
|
| 51 |
+
language_hint: Optional[str] = Form(None),
|
| 52 |
+
output_units: Optional[str] = Form("px"),
|
| 53 |
+
preserve_whitespace: Optional[bool] = Form(True),
|
| 54 |
+
):
|
| 55 |
+
"""Analyze an input image and return structured JSON for reconstruction.
|
| 56 |
+
|
| 57 |
+
Pipeline:
|
| 58 |
+
1. OCR text detection & recognition
|
| 59 |
+
2. Font identification on OCR-detected regions
|
| 60 |
+
3. Typography & geometry extraction
|
| 61 |
+
"""
|
| 62 |
+
analysis_warnings: list[str] = []
|
| 63 |
+
|
| 64 |
+
# --- Validate file extension ---
|
| 65 |
+
filename = image.filename or ""
|
| 66 |
+
ext = Path(filename).suffix.lower()
|
| 67 |
+
if ext not in ALLOWED_EXTENSIONS:
|
| 68 |
+
raise HTTPException(
|
| 69 |
+
status_code=400,
|
| 70 |
+
detail=f"Unsupported image format '{ext}'. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# --- Save upload to temp file ---
|
| 74 |
+
contents = await image.read()
|
| 75 |
+
tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
| 76 |
+
tmp.write(contents)
|
| 77 |
+
tmp.close()
|
| 78 |
+
tmp_path = tmp.name
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
img = Image.open(tmp_path)
|
| 82 |
+
img_width, img_height = img.size
|
| 83 |
+
color_mode = img.mode # RGB, RGBA, L, etc.
|
| 84 |
+
if color_mode == "L":
|
| 85 |
+
color_mode = "GRAY"
|
| 86 |
+
|
| 87 |
+
detected_dpi = dpi
|
| 88 |
+
if detected_dpi is None:
|
| 89 |
+
info = img.info
|
| 90 |
+
if "dpi" in info:
|
| 91 |
+
detected_dpi = int(info["dpi"][0])
|
| 92 |
+
else:
|
| 93 |
+
detected_dpi = 72
|
| 94 |
+
|
| 95 |
+
image_meta = ImageMetadata(
|
| 96 |
+
width=img_width,
|
| 97 |
+
height=img_height,
|
| 98 |
+
dpi=detected_dpi,
|
| 99 |
+
color_mode=color_mode,
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# --- Step 1: OCR ---
|
| 103 |
+
try:
|
| 104 |
+
ocr_blocks = run_ocr(tmp_path, language_hint=language_hint)
|
| 105 |
+
except RuntimeError:
|
| 106 |
+
raise HTTPException(status_code=503, detail="OCR service unavailable")
|
| 107 |
+
|
| 108 |
+
if not ocr_blocks:
|
| 109 |
+
analysis_warnings.append("OCR returned no text blocks")
|
| 110 |
+
return AnalysisResponse(
|
| 111 |
+
image_metadata=image_meta,
|
| 112 |
+
blocks=[],
|
| 113 |
+
warnings=analysis_warnings,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
# --- Steps 2 & 3: Font ID + Typography ---
|
| 117 |
+
blocks: list[TextBlock] = []
|
| 118 |
+
for idx, ocr_block in enumerate(ocr_blocks):
|
| 119 |
+
block_id = f"block_{idx + 1:03d}"
|
| 120 |
+
|
| 121 |
+
# Geometry
|
| 122 |
+
geometry = extract_geometry(ocr_block, img_width, img_height)
|
| 123 |
+
|
| 124 |
+
# Font identification on the cropped region
|
| 125 |
+
font_result = identify_font(img, ocr_block.box)
|
| 126 |
+
|
| 127 |
+
# Typography / rendering
|
| 128 |
+
rendering, font_size_px = extract_rendering(ocr_block, img)
|
| 129 |
+
|
| 130 |
+
# Font metrics
|
| 131 |
+
metrics = estimate_font_metrics(font_size_px)
|
| 132 |
+
|
| 133 |
+
font_info = FontInfo(
|
| 134 |
+
primary=font_result.primary,
|
| 135 |
+
confidence=font_result.confidence,
|
| 136 |
+
alternatives=[
|
| 137 |
+
FontAlternative(name=a.name, confidence=a.confidence)
|
| 138 |
+
for a in font_result.alternatives
|
| 139 |
+
],
|
| 140 |
+
category=font_result.category,
|
| 141 |
+
metrics=metrics,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
if font_result.uncertain:
|
| 145 |
+
analysis_warnings.append(
|
| 146 |
+
f"Font identification uncertain for {block_id}"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
# Characters
|
| 150 |
+
characters = extract_characters(ocr_block, geometry, font_size_px)
|
| 151 |
+
|
| 152 |
+
if not preserve_whitespace:
|
| 153 |
+
text = " ".join(ocr_block.text.split())
|
| 154 |
+
else:
|
| 155 |
+
text = ocr_block.text
|
| 156 |
+
|
| 157 |
+
blocks.append(
|
| 158 |
+
TextBlock(
|
| 159 |
+
id=block_id,
|
| 160 |
+
text=text,
|
| 161 |
+
language=ocr_block.language,
|
| 162 |
+
confidence=ocr_block.confidence,
|
| 163 |
+
reading_order=ocr_block.reading_order,
|
| 164 |
+
geometry=geometry,
|
| 165 |
+
font=font_info,
|
| 166 |
+
rendering=rendering,
|
| 167 |
+
characters=characters,
|
| 168 |
+
)
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
return AnalysisResponse(
|
| 172 |
+
image_metadata=image_meta,
|
| 173 |
+
blocks=blocks,
|
| 174 |
+
font_sources=FontSources(
|
| 175 |
+
strategy="fallback",
|
| 176 |
+
notes="Embed font when possible to ensure rendering parity",
|
| 177 |
+
),
|
| 178 |
+
reconstruction=Reconstruction(),
|
| 179 |
+
warnings=analysis_warnings,
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
except HTTPException:
|
| 183 |
+
raise
|
| 184 |
+
except Exception as exc:
|
| 185 |
+
logger.exception("Unexpected error during analysis")
|
| 186 |
+
raise HTTPException(status_code=500, detail=str(exc))
|
| 187 |
+
finally:
|
| 188 |
+
Path(tmp_path).unlink(missing_ok=True)
|
models.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic models for the image analysis API request and response."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ---------------------------------------------------------------------------
|
| 11 |
+
# Response models
|
| 12 |
+
# ---------------------------------------------------------------------------
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle in image pixel coordinates."""

    x: float
    y: float
    width: float
    height: float


class Geometry(BaseModel):
    """Placement of a text block: box, baseline, rotation and alignment."""

    bounding_box: BoundingBox
    baseline: List[float] = Field(
        ..., description="[x1, y1, x2, y2] baseline coordinates"
    )
    rotation: float = 0.0
    alignment: str = "left"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class FontAlternative(BaseModel):
    """A runner-up font candidate with its classifier confidence."""

    name: str
    confidence: float


class FontMetrics(BaseModel):
    """Vertical font metrics estimated in pixels."""

    ascender_px: float
    descender_px: float
    cap_height_px: float
    x_height_px: float
    units_per_em: int = 1000
    scale_factor: float = 1.0


class FontInfo(BaseModel):
    """Identified font: best guess, confidence, alternatives and metrics."""

    primary: str
    confidence: float
    alternatives: List[FontAlternative] = Field(default_factory=list)
    category: Optional[str] = None
    metrics: FontMetrics
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Rendering(BaseModel):
    """How text is drawn: size, spacing, color and rasterization hints."""

    font_size_px: float
    line_height_px: float
    letter_spacing_px: float
    word_spacing_px: float
    fill_color: str = "#000000"
    antialiasing: str = "grayscale"
    hinting: str = "none"


class CharacterInfo(BaseModel):
    """Per-character placement within a text block."""

    char: str
    box: List[float] = Field(
        ..., description="[x1, y1, x2, y2] bounding box"
    )
    advance_width: float
    baseline_offset: float = 0.0
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class TextBlock(BaseModel):
    """One recognized text region with full geometry and styling data."""

    id: str
    text: str
    language: str = "en"
    confidence: float = 0.0
    reading_order: int = 0
    geometry: Geometry
    font: FontInfo
    rendering: Rendering
    characters: List[CharacterInfo] = Field(default_factory=list)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class ImageMetadata(BaseModel):
    """Basic properties of the analyzed source image."""

    width: int
    height: int
    dpi: int = 72
    color_mode: str = "RGB"


class FontSources(BaseModel):
    """How a renderer should obtain the identified fonts."""

    strategy: str = "fallback"
    notes: str = "Embed font when possible to ensure rendering parity"


class Reconstruction(BaseModel):
    """Reconstruction guarantee and renderers expected to honour it."""

    guarantee: str = "near-pixel-perfect"
    supported_renderers: List[str] = Field(
        default_factory=lambda: ["canvas", "svg", "pdf", "html"]
    )


class AnalysisResponse(BaseModel):
    """Top-level response returned by POST /analyze/image."""

    image_metadata: ImageMetadata
    blocks: List[TextBlock] = Field(default_factory=list)
    font_sources: FontSources = Field(default_factory=FontSources)
    reconstruction: Reconstruction = Field(default_factory=Reconstruction)
    warnings: List[str] = Field(default_factory=list)
|
pipeline/__init__.py
ADDED
|
File without changes
|
pipeline/font_id.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Font identification using Hugging Face font-identifier model."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import logging
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
from PIL import Image
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
HF_FONT_MODEL = "gaborcselle/font-identifier"
|
| 15 |
+
|
| 16 |
+
FONT_CATEGORIES = {
|
| 17 |
+
"arial": "sans",
|
| 18 |
+
"helvetica": "sans",
|
| 19 |
+
"verdana": "sans",
|
| 20 |
+
"tahoma": "sans",
|
| 21 |
+
"calibri": "sans",
|
| 22 |
+
"roboto": "sans",
|
| 23 |
+
"open sans": "sans",
|
| 24 |
+
"times": "serif",
|
| 25 |
+
"times new roman": "serif",
|
| 26 |
+
"georgia": "serif",
|
| 27 |
+
"garamond": "serif",
|
| 28 |
+
"palatino": "serif",
|
| 29 |
+
"courier": "mono",
|
| 30 |
+
"courier new": "mono",
|
| 31 |
+
"consolas": "mono",
|
| 32 |
+
"monaco": "mono",
|
| 33 |
+
"comic sans": "display",
|
| 34 |
+
"impact": "display",
|
| 35 |
+
"papyrus": "handwritten",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
|
| 40 |
+
class FontCandidate:
|
| 41 |
+
name: str
|
| 42 |
+
confidence: float
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
|
| 46 |
+
class FontResult:
|
| 47 |
+
primary: str = "unknown"
|
| 48 |
+
confidence: float = 0.0
|
| 49 |
+
alternatives: List[FontCandidate] = field(default_factory=list)
|
| 50 |
+
category: Optional[str] = None
|
| 51 |
+
uncertain: bool = False
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _categorize(font_name: str) -> Optional[str]:
|
| 55 |
+
lower = font_name.lower()
|
| 56 |
+
for key, cat in FONT_CATEGORIES.items():
|
| 57 |
+
if key in lower:
|
| 58 |
+
return cat
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def identify_font(image: Image.Image, box: List[float]) -> FontResult:
    """Crop the image to *box* and identify the font via the HF model.

    Parameters
    ----------
    image : PIL.Image.Image
        Full original image.
    box : list[float]
        [x1, y1, x2, y2] bounding box of the text region.

    Returns
    -------
    FontResult
        Identified font with confidence and alternatives. A result with
        primary="unknown" and uncertain=True is returned when the crop is
        degenerate or the remote model cannot be reached.
    """
    x1, y1, x2, y2 = box
    crop = image.crop((int(x1), int(y1), int(x2), int(y2)))

    # A 1-pixel-wide/tall crop carries no usable glyph information.
    if crop.width < 2 or crop.height < 2:
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    try:
        import os
        import tempfile

        from gradio_client import Client, handle_file

        # gradio_client needs a real file path, so save the crop straight to
        # a temp PNG (the previous in-memory BytesIO round-trip was redundant).
        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        try:
            crop.save(tmp, format="PNG")
            tmp.close()
            client = Client(HF_FONT_MODEL)
            result = client.predict(handle_file(tmp.name), api_name="/predict")
        finally:
            tmp.close()
            os.unlink(tmp.name)

        if isinstance(result, dict) and "label" in result:
            label = result["label"]
            # "confidences" may be missing OR present-but-empty; guard both
            # (the old [0] index raised IndexError on an empty list).
            confidences = result.get("confidences") or []
            conf = (
                float(confidences[0].get("confidence", 0.0)) if confidences else 0.0
            )
            alternatives = [
                FontCandidate(
                    name=alt.get("label", "unknown"),
                    confidence=float(alt.get("confidence", 0.0)),
                )
                for alt in confidences[1:4]
            ]
            return FontResult(
                primary=label,
                confidence=conf,
                alternatives=alternatives,
                category=_categorize(label),
                uncertain=conf < 0.5,
            )

        if isinstance(result, str):
            name = result.strip()
            return FontResult(
                primary=name,
                confidence=0.5,
                category=_categorize(name),
                uncertain=True,
            )

        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    except Exception as exc:
        # Best-effort: font ID failure must never break the analysis pipeline.
        logger.warning("Font identification failed: %s", exc)
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)
|
pipeline/ocr.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR pipeline using Hugging Face Image-to-Multilingual-OCR space."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
from gradio_client import Client, handle_file
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
HF_OCR_SPACE = "awacke1/Image-to-Multilingual-OCR"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
|
| 18 |
+
class OCRWord:
|
| 19 |
+
text: str
|
| 20 |
+
box: List[float] # [x1, y1, x2, y2]
|
| 21 |
+
confidence: float = 0.0
|
| 22 |
+
language: str = "en"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class OCRBlock:
|
| 27 |
+
text: str
|
| 28 |
+
words: List[OCRWord] = field(default_factory=list)
|
| 29 |
+
box: List[float] = field(default_factory=lambda: [0, 0, 0, 0])
|
| 30 |
+
confidence: float = 0.0
|
| 31 |
+
language: str = "en"
|
| 32 |
+
reading_order: int = 0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _parse_ocr_response(raw_result: str, img_width: int, img_height: int) -> List[OCRBlock]:
|
| 36 |
+
"""Parse the raw text output from the OCR space into structured blocks.
|
| 37 |
+
|
| 38 |
+
The OCR space returns detected text. We parse lines and synthesise
|
| 39 |
+
bounding boxes spread evenly across the image when per-word coordinates
|
| 40 |
+
are not directly available from the API.
|
| 41 |
+
"""
|
| 42 |
+
if not raw_result or not raw_result.strip():
|
| 43 |
+
return []
|
| 44 |
+
|
| 45 |
+
lines = [l for l in raw_result.strip().splitlines() if l.strip()]
|
| 46 |
+
blocks: List[OCRBlock] = []
|
| 47 |
+
|
| 48 |
+
line_height = img_height / max(len(lines), 1)
|
| 49 |
+
|
| 50 |
+
for idx, line in enumerate(lines):
|
| 51 |
+
y1 = idx * line_height
|
| 52 |
+
y2 = y1 + line_height
|
| 53 |
+
x1 = 0.0
|
| 54 |
+
x2 = float(img_width)
|
| 55 |
+
|
| 56 |
+
words_in_line = line.split()
|
| 57 |
+
word_width = (x2 - x1) / max(len(words_in_line), 1)
|
| 58 |
+
|
| 59 |
+
ocr_words: List[OCRWord] = []
|
| 60 |
+
for w_idx, word in enumerate(words_in_line):
|
| 61 |
+
wx1 = x1 + w_idx * word_width
|
| 62 |
+
wx2 = wx1 + word_width
|
| 63 |
+
ocr_words.append(
|
| 64 |
+
OCRWord(
|
| 65 |
+
text=word,
|
| 66 |
+
box=[wx1, y1, wx2, y2],
|
| 67 |
+
confidence=0.90,
|
| 68 |
+
)
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
blocks.append(
|
| 72 |
+
OCRBlock(
|
| 73 |
+
text=line,
|
| 74 |
+
words=ocr_words,
|
| 75 |
+
box=[x1, y1, x2, y2],
|
| 76 |
+
confidence=0.90,
|
| 77 |
+
reading_order=idx,
|
| 78 |
+
)
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
return blocks
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def run_ocr(image_path: str, language_hint: Optional[str] = None) -> List[OCRBlock]:
    """Send an image to the HF OCR space and return structured blocks.

    Parameters
    ----------
    image_path : str
        Path to the image file on disk.
    language_hint : str | None
        Comma-separated language codes (unused by the space but kept for
        API compatibility).

    Returns
    -------
    list[OCRBlock]
        Parsed OCR blocks with word-level data.

    Raises
    ------
    RuntimeError
        When the OCR service is completely unreachable (HTTP 503 equivalent).
    """
    from PIL import Image

    # Only the dimensions are needed; the context manager closes the file
    # handle immediately instead of leaking it until garbage collection.
    with Image.open(image_path) as img:
        img_width, img_height = img.size

    try:
        client = Client(HF_OCR_SPACE)
        result = client.predict(
            handle_file(image_path),
            api_name="/predict",
        )
    except Exception as exc:
        logger.error("OCR space call failed: %s", exc)
        raise RuntimeError(f"OCR service unavailable: {exc}") from exc

    raw_text = str(result) if result else ""
    blocks = _parse_ocr_response(raw_text, img_width, img_height)
    if not blocks:
        logger.warning("OCR returned no text for %s", image_path)
    return blocks
|
pipeline/typography.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typography and geometry extraction from OCR results and image data."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
from app.models import (
|
| 11 |
+
BoundingBox,
|
| 12 |
+
CharacterInfo,
|
| 13 |
+
FontMetrics,
|
| 14 |
+
Geometry,
|
| 15 |
+
Rendering,
|
| 16 |
+
)
|
| 17 |
+
from app.pipeline.ocr import OCRBlock
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _dominant_color(image: Image.Image, box: List[float]) -> str:
    """Return the dominant (most common) color in the region as a hex string."""
    left, top, right, bottom = (int(v) for v in box)
    # Clamp the requested box to the image bounds.
    left = max(left, 0)
    top = max(top, 0)
    right = min(right, image.width)
    bottom = min(bottom, image.height)
    if right <= left or bottom <= top:
        return "#000000"

    region = image.crop((left, top, right, bottom)).convert("RGB")
    pixels = np.array(region).reshape(-1, 3)
    # Text is usually dark: average only the dark pixels when any exist,
    # otherwise fall back to the mean of the whole region.
    is_dark = pixels.sum(axis=1) < 384  # rough threshold
    sample = pixels[is_dark] if is_dark.any() else pixels
    mean_col = sample.mean(axis=0).astype(int)

    return "#{:02x}{:02x}{:02x}".format(*mean_col)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def extract_geometry(block: OCRBlock, img_width: int, img_height: int) -> Geometry:
    """Build a Geometry model from an OCR block's bounding box.

    The baseline is placed at 85% of the block height — a common heuristic
    for Latin scripts when the true baseline is unknown.
    """
    left, top, right, bottom = block.box
    box_width = right - left
    box_height = bottom - top
    baseline_y = top + box_height * 0.85

    return Geometry(
        bounding_box=BoundingBox(x=left, y=top, width=box_width, height=box_height),
        baseline=[left, baseline_y, right, baseline_y],
        rotation=0.0,
        alignment="left",
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def estimate_font_metrics(font_size_px: float) -> FontMetrics:
    """Estimate standard font metrics from the font size.

    Ratios are typical values for Latin text faces (ascender ~80%,
    descender ~20%, cap height ~70%, x-height ~48% of the em size).
    """
    ascender = round(font_size_px * 0.8, 2)
    descender = round(-font_size_px * 0.2, 2)
    cap_height = round(font_size_px * 0.7, 2)
    x_height = round(font_size_px * 0.48, 2)
    return FontMetrics(
        ascender_px=ascender,
        descender_px=descender,
        cap_height_px=cap_height,
        x_height_px=x_height,
        units_per_em=1000,
        scale_factor=1.0,
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def extract_rendering(
    block: OCRBlock, image: Image.Image
) -> Tuple[Rendering, float]:
    """Compute rendering attributes; returns (Rendering, font_size_px)."""
    x1, y1, x2, y2 = block.box
    height = y2 - y1
    width = x2 - x1

    # Glyphs typically occupy ~75% of the line box; default to 12px when the
    # box is degenerate.
    font_size_px = round(height * 0.75, 2) if height > 0 else 12.0
    line_height_px = round(height, 2)

    text = block.text
    n_chars = max(len(text), 1)

    # Average advance assumed at 0.6em; anything left over is letter spacing.
    letter_spacing = round((width / n_chars) - font_size_px * 0.6, 2)
    if letter_spacing < 0:
        letter_spacing = 0.0

    words = text.split()
    n_spaces = max(len(words) - 1, 1)
    total_char_width = n_chars * font_size_px * 0.6
    word_spacing = round((width - total_char_width) / n_spaces, 2)
    if word_spacing < 0:
        # Negative estimate means the 0.6em assumption overshot; fall back
        # to a conventional quarter-em word gap.
        word_spacing = round(font_size_px * 0.25, 2)

    rendering = Rendering(
        font_size_px=font_size_px,
        line_height_px=line_height_px,
        letter_spacing_px=letter_spacing,
        word_spacing_px=word_spacing,
        fill_color=_dominant_color(image, block.box),
        antialiasing="grayscale",
        hinting="none",
    )
    return rendering, font_size_px
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def extract_characters(
    block: OCRBlock, geometry: Geometry, font_size_px: float
) -> List[CharacterInfo]:
    """Generate per-character bounding boxes spread evenly across the block."""
    text = block.text
    if not text:
        return []

    bb = geometry.bounding_box
    # Uniform advance: every character gets an equal horizontal slice.
    advance = bb.width / max(len(text), 1)

    top = round(bb.y, 2)
    bottom = round(bb.y + bb.height, 2)
    advance_rounded = round(advance, 2)

    cells: List[CharacterInfo] = []
    for index, glyph in enumerate(text):
        left = round(bb.x + index * advance, 2)
        right = round(left + advance, 2)
        cells.append(
            CharacterInfo(
                char=glyph,
                box=[left, top, right, bottom],
                advance_width=advance_rounded,
                baseline_offset=0.0,
            )
        )
    return cells
|