Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- __init__.py +0 -0
- main.py +188 -0
- models.py +106 -0
- pipeline/__init__.py +0 -0
- pipeline/font_id.py +134 -0
- pipeline/ocr.py +123 -0
- pipeline/typography.py +135 -0
__init__.py
ADDED
|
File without changes
|
main.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application – image analysis endpoint."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
from app.models import (
|
| 14 |
+
AnalysisResponse,
|
| 15 |
+
FontAlternative,
|
| 16 |
+
FontInfo,
|
| 17 |
+
FontSources,
|
| 18 |
+
ImageMetadata,
|
| 19 |
+
Reconstruction,
|
| 20 |
+
TextBlock,
|
| 21 |
+
)
|
| 22 |
+
from app.pipeline.font_id import identify_font
|
| 23 |
+
from app.pipeline.ocr import run_ocr
|
| 24 |
+
from app.pipeline.typography import (
|
| 25 |
+
estimate_font_metrics,
|
| 26 |
+
extract_characters,
|
| 27 |
+
extract_geometry,
|
| 28 |
+
extract_rendering,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
ALLOWED_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp"}
|
| 34 |
+
|
| 35 |
+
app = FastAPI(
|
| 36 |
+
title="Image Analysis API",
|
| 37 |
+
description="Analyzes images and returns JSON for near-pixel-perfect reconstruction.",
|
| 38 |
+
version="1.0.0",
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@app.get("/")
|
| 43 |
+
async def root():
|
| 44 |
+
return {"status": "ok", "message": "Image Analysis API is running."}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@app.post("/analyze/image", response_model=AnalysisResponse)
|
| 48 |
+
async def analyze_image(
|
| 49 |
+
image: UploadFile = File(...),
|
| 50 |
+
dpi: Optional[int] = Form(None),
|
| 51 |
+
language_hint: Optional[str] = Form(None),
|
| 52 |
+
output_units: Optional[str] = Form("px"),
|
| 53 |
+
preserve_whitespace: Optional[bool] = Form(True),
|
| 54 |
+
):
|
| 55 |
+
"""Analyze an input image and return structured JSON for reconstruction.
|
| 56 |
+
|
| 57 |
+
Pipeline:
|
| 58 |
+
1. OCR text detection & recognition
|
| 59 |
+
2. Font identification on OCR-detected regions
|
| 60 |
+
3. Typography & geometry extraction
|
| 61 |
+
"""
|
| 62 |
+
analysis_warnings: list[str] = []
|
| 63 |
+
|
| 64 |
+
# --- Validate file extension ---
|
| 65 |
+
filename = image.filename or ""
|
| 66 |
+
ext = Path(filename).suffix.lower()
|
| 67 |
+
if ext not in ALLOWED_EXTENSIONS:
|
| 68 |
+
raise HTTPException(
|
| 69 |
+
status_code=400,
|
| 70 |
+
detail=f"Unsupported image format '{ext}'. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# --- Save upload to temp file ---
|
| 74 |
+
contents = await image.read()
|
| 75 |
+
tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
| 76 |
+
tmp.write(contents)
|
| 77 |
+
tmp.close()
|
| 78 |
+
tmp_path = tmp.name
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
img = Image.open(tmp_path)
|
| 82 |
+
img_width, img_height = img.size
|
| 83 |
+
color_mode = img.mode # RGB, RGBA, L, etc.
|
| 84 |
+
if color_mode == "L":
|
| 85 |
+
color_mode = "GRAY"
|
| 86 |
+
|
| 87 |
+
detected_dpi = dpi
|
| 88 |
+
if detected_dpi is None:
|
| 89 |
+
info = img.info
|
| 90 |
+
if "dpi" in info:
|
| 91 |
+
detected_dpi = int(info["dpi"][0])
|
| 92 |
+
else:
|
| 93 |
+
detected_dpi = 72
|
| 94 |
+
|
| 95 |
+
image_meta = ImageMetadata(
|
| 96 |
+
width=img_width,
|
| 97 |
+
height=img_height,
|
| 98 |
+
dpi=detected_dpi,
|
| 99 |
+
color_mode=color_mode,
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# --- Step 1: OCR ---
|
| 103 |
+
try:
|
| 104 |
+
ocr_blocks = run_ocr(tmp_path, language_hint=language_hint)
|
| 105 |
+
except RuntimeError:
|
| 106 |
+
raise HTTPException(status_code=503, detail="OCR service unavailable")
|
| 107 |
+
|
| 108 |
+
if not ocr_blocks:
|
| 109 |
+
analysis_warnings.append("OCR returned no text blocks")
|
| 110 |
+
return AnalysisResponse(
|
| 111 |
+
image_metadata=image_meta,
|
| 112 |
+
blocks=[],
|
| 113 |
+
warnings=analysis_warnings,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
# --- Steps 2 & 3: Font ID + Typography ---
|
| 117 |
+
blocks: list[TextBlock] = []
|
| 118 |
+
for idx, ocr_block in enumerate(ocr_blocks):
|
| 119 |
+
block_id = f"block_{idx + 1:03d}"
|
| 120 |
+
|
| 121 |
+
# Geometry
|
| 122 |
+
geometry = extract_geometry(ocr_block, img_width, img_height)
|
| 123 |
+
|
| 124 |
+
# Font identification on the cropped region
|
| 125 |
+
font_result = identify_font(img, ocr_block.box)
|
| 126 |
+
|
| 127 |
+
# Typography / rendering
|
| 128 |
+
rendering, font_size_px = extract_rendering(ocr_block, img)
|
| 129 |
+
|
| 130 |
+
# Font metrics
|
| 131 |
+
metrics = estimate_font_metrics(font_size_px)
|
| 132 |
+
|
| 133 |
+
font_info = FontInfo(
|
| 134 |
+
primary=font_result.primary,
|
| 135 |
+
confidence=font_result.confidence,
|
| 136 |
+
alternatives=[
|
| 137 |
+
FontAlternative(name=a.name, confidence=a.confidence)
|
| 138 |
+
for a in font_result.alternatives
|
| 139 |
+
],
|
| 140 |
+
category=font_result.category,
|
| 141 |
+
metrics=metrics,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
if font_result.uncertain:
|
| 145 |
+
analysis_warnings.append(
|
| 146 |
+
f"Font identification uncertain for {block_id}"
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
# Characters
|
| 150 |
+
characters = extract_characters(ocr_block, geometry, font_size_px)
|
| 151 |
+
|
| 152 |
+
if not preserve_whitespace:
|
| 153 |
+
text = " ".join(ocr_block.text.split())
|
| 154 |
+
else:
|
| 155 |
+
text = ocr_block.text
|
| 156 |
+
|
| 157 |
+
blocks.append(
|
| 158 |
+
TextBlock(
|
| 159 |
+
id=block_id,
|
| 160 |
+
text=text,
|
| 161 |
+
language=ocr_block.language,
|
| 162 |
+
confidence=ocr_block.confidence,
|
| 163 |
+
reading_order=ocr_block.reading_order,
|
| 164 |
+
geometry=geometry,
|
| 165 |
+
font=font_info,
|
| 166 |
+
rendering=rendering,
|
| 167 |
+
characters=characters,
|
| 168 |
+
)
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
return AnalysisResponse(
|
| 172 |
+
image_metadata=image_meta,
|
| 173 |
+
blocks=blocks,
|
| 174 |
+
font_sources=FontSources(
|
| 175 |
+
strategy="fallback",
|
| 176 |
+
notes="Embed font when possible to ensure rendering parity",
|
| 177 |
+
),
|
| 178 |
+
reconstruction=Reconstruction(),
|
| 179 |
+
warnings=analysis_warnings,
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
except HTTPException:
|
| 183 |
+
raise
|
| 184 |
+
except Exception as exc:
|
| 185 |
+
logger.exception("Unexpected error during analysis")
|
| 186 |
+
raise HTTPException(status_code=500, detail=str(exc))
|
| 187 |
+
finally:
|
| 188 |
+
Path(tmp_path).unlink(missing_ok=True)
|
models.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic models for the image analysis API request and response."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ---------------------------------------------------------------------------
|
| 11 |
+
# Response models
|
| 12 |
+
# ---------------------------------------------------------------------------
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle in image pixel coordinates."""

    x: float
    y: float
    width: float
    height: float


class Geometry(BaseModel):
    """Placement of a text block: box, baseline, rotation and alignment."""

    bounding_box: BoundingBox
    baseline: List[float] = Field(
        ..., description="[x1, y1, x2, y2] baseline coordinates"
    )
    rotation: float = 0.0
    alignment: str = "left"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class FontAlternative(BaseModel):
    """A runner-up font candidate with its classifier confidence."""

    name: str
    confidence: float


class FontMetrics(BaseModel):
    """Vertical font metrics estimated in pixels."""

    ascender_px: float
    descender_px: float
    cap_height_px: float
    x_height_px: float
    units_per_em: int = 1000
    scale_factor: float = 1.0


class FontInfo(BaseModel):
    """Identified font: best guess, confidence, alternatives and metrics."""

    primary: str
    confidence: float
    alternatives: List[FontAlternative] = Field(default_factory=list)
    category: Optional[str] = None
    metrics: FontMetrics
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Rendering(BaseModel):
    """How text is drawn: size, spacing, color and rasterization hints."""

    font_size_px: float
    line_height_px: float
    letter_spacing_px: float
    word_spacing_px: float
    fill_color: str = "#000000"
    antialiasing: str = "grayscale"
    hinting: str = "none"


class CharacterInfo(BaseModel):
    """Per-character placement within a text block."""

    char: str
    box: List[float] = Field(
        ..., description="[x1, y1, x2, y2] bounding box"
    )
    advance_width: float
    baseline_offset: float = 0.0
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class TextBlock(BaseModel):
    """One recognized text region with full geometry and styling data."""

    id: str
    text: str
    language: str = "en"
    confidence: float = 0.0
    reading_order: int = 0
    geometry: Geometry
    font: FontInfo
    rendering: Rendering
    characters: List[CharacterInfo] = Field(default_factory=list)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class ImageMetadata(BaseModel):
    """Basic properties of the analyzed source image."""

    width: int
    height: int
    dpi: int = 72
    color_mode: str = "RGB"


class FontSources(BaseModel):
    """How a renderer should obtain the identified fonts."""

    strategy: str = "fallback"
    notes: str = "Embed font when possible to ensure rendering parity"


class Reconstruction(BaseModel):
    """Reconstruction guarantee and renderers expected to honour it."""

    guarantee: str = "near-pixel-perfect"
    supported_renderers: List[str] = Field(
        default_factory=lambda: ["canvas", "svg", "pdf", "html"]
    )


class AnalysisResponse(BaseModel):
    """Top-level response returned by POST /analyze/image."""

    image_metadata: ImageMetadata
    blocks: List[TextBlock] = Field(default_factory=list)
    font_sources: FontSources = Field(default_factory=FontSources)
    reconstruction: Reconstruction = Field(default_factory=Reconstruction)
    warnings: List[str] = Field(default_factory=list)
|
pipeline/__init__.py
ADDED
|
File without changes
|
pipeline/font_id.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Font identification using Hugging Face font-identifier model."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import logging
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
from PIL import Image
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
HF_FONT_MODEL = "gaborcselle/font-identifier"
|
| 15 |
+
|
| 16 |
+
FONT_CATEGORIES = {
|
| 17 |
+
"arial": "sans",
|
| 18 |
+
"helvetica": "sans",
|
| 19 |
+
"verdana": "sans",
|
| 20 |
+
"tahoma": "sans",
|
| 21 |
+
"calibri": "sans",
|
| 22 |
+
"roboto": "sans",
|
| 23 |
+
"open sans": "sans",
|
| 24 |
+
"times": "serif",
|
| 25 |
+
"times new roman": "serif",
|
| 26 |
+
"georgia": "serif",
|
| 27 |
+
"garamond": "serif",
|
| 28 |
+
"palatino": "serif",
|
| 29 |
+
"courier": "mono",
|
| 30 |
+
"courier new": "mono",
|
| 31 |
+
"consolas": "mono",
|
| 32 |
+
"monaco": "mono",
|
| 33 |
+
"comic sans": "display",
|
| 34 |
+
"impact": "display",
|
| 35 |
+
"papyrus": "handwritten",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
|
| 40 |
+
class FontCandidate:
|
| 41 |
+
name: str
|
| 42 |
+
confidence: float
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
|
| 46 |
+
class FontResult:
|
| 47 |
+
primary: str = "unknown"
|
| 48 |
+
confidence: float = 0.0
|
| 49 |
+
alternatives: List[FontCandidate] = field(default_factory=list)
|
| 50 |
+
category: Optional[str] = None
|
| 51 |
+
uncertain: bool = False
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _categorize(font_name: str) -> Optional[str]:
|
| 55 |
+
lower = font_name.lower()
|
| 56 |
+
for key, cat in FONT_CATEGORIES.items():
|
| 57 |
+
if key in lower:
|
| 58 |
+
return cat
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def identify_font(image: Image.Image, box: List[float]) -> FontResult:
    """Crop the image to *box* and identify the font via the HF model.

    Parameters
    ----------
    image : PIL.Image.Image
        Full original image.
    box : list[float]
        [x1, y1, x2, y2] bounding box of the text region.

    Returns
    -------
    FontResult
        Identified font with confidence and alternatives. A result with
        primary="unknown" and uncertain=True is returned when the crop is
        degenerate or the remote model cannot be reached.
    """
    x1, y1, x2, y2 = box
    crop = image.crop((int(x1), int(y1), int(x2), int(y2)))

    # A 1-pixel-wide/tall crop carries no usable glyph information.
    if crop.width < 2 or crop.height < 2:
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    try:
        import os
        import tempfile

        from gradio_client import Client, handle_file

        # gradio_client needs a real file path, so save the crop straight to
        # a temp PNG (the previous in-memory BytesIO round-trip was redundant).
        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        try:
            crop.save(tmp, format="PNG")
            tmp.close()
            client = Client(HF_FONT_MODEL)
            result = client.predict(handle_file(tmp.name), api_name="/predict")
        finally:
            tmp.close()
            os.unlink(tmp.name)

        if isinstance(result, dict) and "label" in result:
            label = result["label"]
            # "confidences" may be missing OR present-but-empty; guard both
            # (the old [0] index raised IndexError on an empty list).
            confidences = result.get("confidences") or []
            conf = (
                float(confidences[0].get("confidence", 0.0)) if confidences else 0.0
            )
            alternatives = [
                FontCandidate(
                    name=alt.get("label", "unknown"),
                    confidence=float(alt.get("confidence", 0.0)),
                )
                for alt in confidences[1:4]
            ]
            return FontResult(
                primary=label,
                confidence=conf,
                alternatives=alternatives,
                category=_categorize(label),
                uncertain=conf < 0.5,
            )

        if isinstance(result, str):
            name = result.strip()
            return FontResult(
                primary=name,
                confidence=0.5,
                category=_categorize(name),
                uncertain=True,
            )

        return FontResult(primary="unknown", confidence=0.0, uncertain=True)

    except Exception as exc:
        # Best-effort: font ID failure must never break the analysis pipeline.
        logger.warning("Font identification failed: %s", exc)
        return FontResult(primary="unknown", confidence=0.0, uncertain=True)
|
pipeline/ocr.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR pipeline using Hugging Face Image-to-Multilingual-OCR space."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
from gradio_client import Client, handle_file
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
HF_OCR_SPACE = "awacke1/Image-to-Multilingual-OCR"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
|
| 18 |
+
class OCRWord:
|
| 19 |
+
text: str
|
| 20 |
+
box: List[float] # [x1, y1, x2, y2]
|
| 21 |
+
confidence: float = 0.0
|
| 22 |
+
language: str = "en"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class OCRBlock:
|
| 27 |
+
text: str
|
| 28 |
+
words: List[OCRWord] = field(default_factory=list)
|
| 29 |
+
box: List[float] = field(default_factory=lambda: [0, 0, 0, 0])
|
| 30 |
+
confidence: float = 0.0
|
| 31 |
+
language: str = "en"
|
| 32 |
+
reading_order: int = 0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _parse_ocr_response(raw_result: str, img_width: int, img_height: int) -> List[OCRBlock]:
|
| 36 |
+
"""Parse the raw text output from the OCR space into structured blocks.
|
| 37 |
+
|
| 38 |
+
The OCR space returns detected text. We parse lines and synthesise
|
| 39 |
+
bounding boxes spread evenly across the image when per-word coordinates
|
| 40 |
+
are not directly available from the API.
|
| 41 |
+
"""
|
| 42 |
+
if not raw_result or not raw_result.strip():
|
| 43 |
+
return []
|
| 44 |
+
|
| 45 |
+
lines = [l for l in raw_result.strip().splitlines() if l.strip()]
|
| 46 |
+
blocks: List[OCRBlock] = []
|
| 47 |
+
|
| 48 |
+
line_height = img_height / max(len(lines), 1)
|
| 49 |
+
|
| 50 |
+
for idx, line in enumerate(lines):
|
| 51 |
+
y1 = idx * line_height
|
| 52 |
+
y2 = y1 + line_height
|
| 53 |
+
x1 = 0.0
|
| 54 |
+
x2 = float(img_width)
|
| 55 |
+
|
| 56 |
+
words_in_line = line.split()
|
| 57 |
+
word_width = (x2 - x1) / max(len(words_in_line), 1)
|
| 58 |
+
|
| 59 |
+
ocr_words: List[OCRWord] = []
|
| 60 |
+
for w_idx, word in enumerate(words_in_line):
|
| 61 |
+
wx1 = x1 + w_idx * word_width
|
| 62 |
+
wx2 = wx1 + word_width
|
| 63 |
+
ocr_words.append(
|
| 64 |
+
OCRWord(
|
| 65 |
+
text=word,
|
| 66 |
+
box=[wx1, y1, wx2, y2],
|
| 67 |
+
confidence=0.90,
|
| 68 |
+
)
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
blocks.append(
|
| 72 |
+
OCRBlock(
|
| 73 |
+
text=line,
|
| 74 |
+
words=ocr_words,
|
| 75 |
+
box=[x1, y1, x2, y2],
|
| 76 |
+
confidence=0.90,
|
| 77 |
+
reading_order=idx,
|
| 78 |
+
)
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
return blocks
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def run_ocr(image_path: str, language_hint: Optional[str] = None) -> List[OCRBlock]:
    """Send an image to the HF OCR space and return structured blocks.

    Parameters
    ----------
    image_path : str
        Path to the image file on disk.
    language_hint : str | None
        Comma-separated language codes (unused by the space but kept for
        API compatibility).

    Returns
    -------
    list[OCRBlock]
        Parsed OCR blocks with word-level data.

    Raises
    ------
    RuntimeError
        When the OCR service is completely unreachable (HTTP 503 equivalent).
    """
    from PIL import Image

    # Only the dimensions are needed; the context manager closes the file
    # handle immediately instead of leaking it until garbage collection.
    with Image.open(image_path) as img:
        img_width, img_height = img.size

    try:
        client = Client(HF_OCR_SPACE)
        result = client.predict(
            handle_file(image_path),
            api_name="/predict",
        )
    except Exception as exc:
        logger.error("OCR space call failed: %s", exc)
        raise RuntimeError(f"OCR service unavailable: {exc}") from exc

    raw_text = str(result) if result else ""
    blocks = _parse_ocr_response(raw_text, img_width, img_height)
    if not blocks:
        logger.warning("OCR returned no text for %s", image_path)
    return blocks
|
pipeline/typography.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typography and geometry extraction from OCR results and image data."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
from app.models import (
|
| 11 |
+
BoundingBox,
|
| 12 |
+
CharacterInfo,
|
| 13 |
+
FontMetrics,
|
| 14 |
+
Geometry,
|
| 15 |
+
Rendering,
|
| 16 |
+
)
|
| 17 |
+
from app.pipeline.ocr import OCRBlock
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _dominant_color(image: Image.Image, box: List[float]) -> str:
    """Return the dominant (most common) color in the region as a hex string."""
    left, top, right, bottom = (int(v) for v in box)
    # Clamp the requested box to the image bounds.
    left = max(left, 0)
    top = max(top, 0)
    right = min(right, image.width)
    bottom = min(bottom, image.height)
    if right <= left or bottom <= top:
        return "#000000"

    region = image.crop((left, top, right, bottom)).convert("RGB")
    pixels = np.array(region).reshape(-1, 3)
    # Text is usually dark: average only the dark pixels when any exist,
    # otherwise fall back to the mean of the whole region.
    is_dark = pixels.sum(axis=1) < 384  # rough threshold
    sample = pixels[is_dark] if is_dark.any() else pixels
    mean_col = sample.mean(axis=0).astype(int)

    return "#{:02x}{:02x}{:02x}".format(*mean_col)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def extract_geometry(block: OCRBlock, img_width: int, img_height: int) -> Geometry:
    """Build a Geometry model from an OCR block's bounding box.

    The baseline is placed at 85% of the block height — a common heuristic
    for Latin scripts when the true baseline is unknown.
    """
    left, top, right, bottom = block.box
    box_width = right - left
    box_height = bottom - top
    baseline_y = top + box_height * 0.85

    return Geometry(
        bounding_box=BoundingBox(x=left, y=top, width=box_width, height=box_height),
        baseline=[left, baseline_y, right, baseline_y],
        rotation=0.0,
        alignment="left",
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def estimate_font_metrics(font_size_px: float) -> FontMetrics:
    """Estimate standard font metrics from the font size.

    Ratios are typical values for Latin text faces (ascender ~80%,
    descender ~20%, cap height ~70%, x-height ~48% of the em size).
    """
    ascender = round(font_size_px * 0.8, 2)
    descender = round(-font_size_px * 0.2, 2)
    cap_height = round(font_size_px * 0.7, 2)
    x_height = round(font_size_px * 0.48, 2)
    return FontMetrics(
        ascender_px=ascender,
        descender_px=descender,
        cap_height_px=cap_height,
        x_height_px=x_height,
        units_per_em=1000,
        scale_factor=1.0,
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def extract_rendering(
    block: OCRBlock, image: Image.Image
) -> Tuple[Rendering, float]:
    """Compute rendering attributes; returns (Rendering, font_size_px)."""
    x1, y1, x2, y2 = block.box
    height = y2 - y1
    width = x2 - x1

    # Glyphs typically occupy ~75% of the line box; default to 12px when the
    # box is degenerate.
    font_size_px = round(height * 0.75, 2) if height > 0 else 12.0
    line_height_px = round(height, 2)

    text = block.text
    n_chars = max(len(text), 1)

    # Average advance assumed at 0.6em; anything left over is letter spacing.
    letter_spacing = round((width / n_chars) - font_size_px * 0.6, 2)
    if letter_spacing < 0:
        letter_spacing = 0.0

    words = text.split()
    n_spaces = max(len(words) - 1, 1)
    total_char_width = n_chars * font_size_px * 0.6
    word_spacing = round((width - total_char_width) / n_spaces, 2)
    if word_spacing < 0:
        # Negative estimate means the 0.6em assumption overshot; fall back
        # to a conventional quarter-em word gap.
        word_spacing = round(font_size_px * 0.25, 2)

    rendering = Rendering(
        font_size_px=font_size_px,
        line_height_px=line_height_px,
        letter_spacing_px=letter_spacing,
        word_spacing_px=word_spacing,
        fill_color=_dominant_color(image, block.box),
        antialiasing="grayscale",
        hinting="none",
    )
    return rendering, font_size_px
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def extract_characters(
    block: OCRBlock, geometry: Geometry, font_size_px: float
) -> List[CharacterInfo]:
    """Generate per-character bounding boxes spread evenly across the block."""
    text = block.text
    if not text:
        return []

    bb = geometry.bounding_box
    # Uniform advance: every character gets an equal horizontal slice.
    advance = bb.width / max(len(text), 1)

    top = round(bb.y, 2)
    bottom = round(bb.y + bb.height, 2)
    advance_rounded = round(advance, 2)

    cells: List[CharacterInfo] = []
    for index, glyph in enumerate(text):
        left = round(bb.x + index * advance, 2)
        right = round(left + advance, 2)
        cells.append(
            CharacterInfo(
                char=glyph,
                box=[left, top, right, bottom],
                advance_width=advance_rounded,
                baseline_offset=0.0,
            )
        )
    return cells
|