Unified project structure: app_space.py for ZeroGPU, root README metadata
Files changed:

- README.md +15 -3
- hf_space/app.py → app_space.py +0 -0
- hf_space/README.md +0 -27
- hf_space/backends/__init__.py +0 -78
- hf_space/backends/gguf_backend.py +0 -138
- hf_space/backends/pytorch_backend.py +0 -136
- hf_space/requirements.txt +0 -10
- requirements.txt +1 -0
README.md
CHANGED

@@ -1,6 +1,18 @@
+---
+title: LightOnOCR 1B Demo
+emoji: π
+colorFrom: blue
+colorTo: indigo
+sdk: gradio
+sdk_version: 5.42.0
+app_file: app_space.py
+pinned: false
+license: other
+---
+
 # LightOnOCR-1B Demo
 
-High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon.
+High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon and ZeroGPU.
 
 ## Performance
 - **GGUF Backend:** ~3-4 seconds per page (M3 Max)!
@@ -8,12 +20,12 @@ High-performance OCR application using LightOnOCR-1B model, optimized for Apple
 
 ## Features
 - PDF and image support
-- Seamless switching between GGUF and PyTorch backends
+- Seamless switching between GGUF and PyTorch backends (Local)
 - Configurable resolution (scale) and token generation
 - CLI and Gradio web interface
 - Full Metal/MPS support
 
-## Quick Start
+## Quick Start (Local)
 
 ### 1. Prerequisites
 - Python 3.10+
hf_space/app.py → app_space.py
RENAMED

File without changes
hf_space/README.md
DELETED

@@ -1,27 +0,0 @@
----
-title: LightOnOCR 1B Demo
-emoji: π
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.42.0
-app_file: app.py
-pinned: false
-license: other
----
-
-# LightOnOCR-1B Demo
-
-A high-performance OCR demo using the **LightOnOCR-1B** model.
-This demo uses the PyTorch backend optimized for accuracy.
-
-## Features
-- **PDF & Image Input:** Upload multi-page PDFs or single images.
-- **Configurable Generation:** Adjust temperature and max tokens.
-- **ZeroGPU Support:** Runs efficiently on Hugging Face ZeroGPU infrastructure.
-
-## Model
-Uses [lightonai/LightOnOCR-1B-1025](https://huggingface.co/lightonai/LightOnOCR-1B-1025).
-
-## Local Development
-To run this locally with maximum performance (including GGUF support for Apple Silicon), see the GitHub repository.
hf_space/backends/__init__.py
DELETED

@@ -1,78 +0,0 @@
-"""
-Backend interface for LightOnOCR-1B inference.
-Supports both PyTorch and GGUF backends.
-"""
-
-from abc import ABC, abstractmethod
-from typing import List, Tuple
-from PIL import Image
-
-
-class OCRBackend(ABC):
-    """Abstract base class for OCR backends."""
-
-    @abstractmethod
-    def load_model(self):
-        """Load the OCR model."""
-        pass
-
-    @abstractmethod
-    def process_image(self, image: Image.Image, temperature: float = 0.1) -> str:
-        """
-        Process a single image and return extracted text.
-
-        Args:
-            image: PIL Image to process
-            temperature: Sampling temperature (0 = greedy)
-
-        Returns:
-            Extracted text as string
-        """
-        pass
-
-    @abstractmethod
-    def get_backend_info(self) -> dict:
-        """Return backend information (name, device, memory usage, etc.)."""
-        pass
-
-
-def get_available_backends() -> List[str]:
-    """Return list of available backend names."""
-    backends = ["pytorch"]
-
-    # Check for GGUF support (binary or python package)
-    from pathlib import Path
-    project_root = Path(__file__).parent.parent
-    cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
-
-    if cli_path.exists():
-        backends.append("gguf")
-    else:
-        # Fallback check for python package (though we prefer CLI now)
-        try:
-            import llama_cpp
-            backends.append("gguf")
-        except ImportError:
-            pass
-
-    return backends
-
-
-def create_backend(backend_name: str) -> OCRBackend:
-    """
-    Factory function to create backend instance.
-
-    Args:
-        backend_name: "pytorch" or "gguf"
-
-    Returns:
-        OCRBackend instance
-    """
-    if backend_name == "pytorch":
-        from .pytorch_backend import PyTorchBackend
-        return PyTorchBackend()
-    elif backend_name == "gguf":
-        from .gguf_backend import GGUFBackend
-        return GGUFBackend()
-    else:
-        raise ValueError(f"Unknown backend: {backend_name}. Available: {get_available_backends()}")
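For context: this removed factory was the entry point callers used to pick a backend. A minimal usage sketch, assuming the old hf_space layout and a placeholder input image (`page.png` is illustrative, not from the commit):

```python
from PIL import Image
from backends import get_available_backends, create_backend

# Prefer the GGUF backend when the llama-mtmd-cli binary or the
# llama_cpp package is available; fall back to PyTorch otherwise.
name = "gguf" if "gguf" in get_available_backends() else "pytorch"
backend = create_backend(name)
backend.load_model()

# "page.png" is a placeholder; any PIL-readable image works.
text = backend.process_image(Image.open("page.png"), temperature=0.0)
print(backend.get_backend_info())
print(text)
```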
hf_space/backends/gguf_backend.py
DELETED

@@ -1,138 +0,0 @@
-"""
-GGUF backend for LightOnOCR-1B using local llama-mtmd-cli binary.
-"""
-
-import os
-import io
-import tempfile
-import subprocess
-from pathlib import Path
-from PIL import Image
-from typing import Optional
-
-from . import OCRBackend
-
-
-class GGUFBackend(OCRBackend):
-    """GGUF-based OCR backend using local llama-mtmd-cli binary."""
-
-    def __init__(self, model_path: Optional[str] = None, mmproj_path: Optional[str] = None):
-        """
-        Initialize GGUF backend.
-
-        Args:
-            model_path: Path to GGUF model file
-            mmproj_path: Path to mmproj file
-        """
-        self.model_path = model_path
-        self.mmproj_path = mmproj_path
-        self.cli_path = self._find_cli_binary()
-        self._auto_detect_files()
-
-    def _find_cli_binary(self) -> Optional[str]:
-        """Find the llama-mtmd-cli binary."""
-        # Check project root llama.cpp build
-        project_root = Path(__file__).parent.parent
-        cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
-        if cli_path.exists():
-            return str(cli_path)
-        return None
-
-    def _auto_detect_files(self):
-        """Try to find GGUF model and mmproj files."""
-        if self.model_path and Path(self.model_path).exists():
-            if not self.mmproj_path:
-                model_dir = Path(self.model_path).parent
-                for mmproj_file in model_dir.glob("*mmproj*.gguf"):
-                    self.mmproj_path = str(mmproj_file)
-                    print(f"Auto-detected mmproj: {self.mmproj_path}")
-                    break
-            return
-
-        search_paths = [
-            Path.cwd() / "models",
-            Path.cwd() / "gguf_models",
-        ]
-
-        for search_path in search_paths:
-            if not search_path.exists():
-                continue
-            for gguf_file in search_path.rglob("*.gguf"):
-                if "lightonocr" in gguf_file.name.lower() and "mmproj" not in gguf_file.name.lower():
-                    self.model_path = str(gguf_file)
-                    print(f"Auto-detected model: {self.model_path}")
-                    model_dir = gguf_file.parent
-                    for mmproj_file in model_dir.glob("*mmproj*.gguf"):
-                        self.mmproj_path = str(mmproj_file)
-                        print(f"Auto-detected mmproj: {self.mmproj_path}")
-                        break
-                    break
-            if self.model_path:
-                break
-
-    def load_model(self):
-        """Verify model, mmproj and CLI binary exist."""
-        if not self.cli_path:
-            raise RuntimeError(
-                "llama-mtmd-cli binary not found.\n"
-                "Please build llama.cpp locally:\n"
-                "  git clone https://github.com/ggerganov/llama.cpp\n"
-                "  cd llama.cpp && mkdir build && cd build\n"
-                "  cmake .. -DGGML_METAL=ON && cmake --build . --config Release"
-            )
-
-        if not self.model_path or not Path(self.model_path).exists():
-            raise ValueError("GGUF model not found. Run download_gguf_model.py")
-
-        if not self.mmproj_path or not Path(self.mmproj_path).exists():
-            raise ValueError("mmproj file not found. Run download_gguf_model.py")
-
-        print(f"GGUF Backend ready:")
-        print(f"  CLI: {self.cli_path}")
-        print(f"  Model: {self.model_path}")
-        print(f"  Projector: {self.mmproj_path}")
-
-    def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
-        """Process image using llama-mtmd-cli."""
-        if not self.cli_path:
-            self.load_model()
-
-        # Save image to temp file
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
-            image.save(tmp_img.name)
-            tmp_img_path = tmp_img.name
-
-        try:
-            cmd = [
-                self.cli_path,
-                "-m", self.model_path,
-                "--mmproj", self.mmproj_path,
-                "--image", tmp_img_path,
-                "-p", "Extract all text from this image. Be precise and include all visible text.",
-                "--temp", str(temperature),
-                "--n-predict", str(max_tokens),
-                # "--log-disable"  # Removed as it suppresses output
-            ]
-
-            # Run CLI
-            result = subprocess.run(cmd, capture_output=True, text=True)
-
-            if result.returncode != 0:
-                print(f"CLI Error: {result.stderr}")
-                raise RuntimeError(f"llama-mtmd-cli failed: {result.stderr}")
-
-            # stdout contains the generated text, stderr contains logs
-            return result.stdout.strip()
-
-        finally:
-            if os.path.exists(tmp_img_path):
-                os.unlink(tmp_img_path)
-
-    def get_backend_info(self) -> dict:
-        return {
-            "name": "GGUF (llama-mtmd-cli)",
-            "device": "Metal (via CLI)",
-            "model_path": self.model_path or "not found",
-            "mmproj_path": self.mmproj_path or "not found",
-            "cli_path": self.cli_path
-        }
hf_space/backends/pytorch_backend.py
DELETED

@@ -1,136 +0,0 @@
-"""
-PyTorch backend for LightOnOCR-1B.
-Uses Mistral3ForConditionalGeneration with custom weight remapping.
-"""
-
-import torch
-import platform
-from pathlib import Path
-from PIL import Image
-from transformers import AutoConfig, PixtralProcessor, Mistral3ForConditionalGeneration
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
-
-from . import OCRBackend
-
-
-class PyTorchBackend(OCRBackend):
-    """PyTorch-based OCR backend using transformers."""
-
-    def __init__(self):
-        self.model = None
-        self.processor = None
-        self.device = None
-        self.dtype = None
-        self.model_id = "lightonai/LightOnOCR-1B-1025"
-
-    def load_model(self):
-        """Load the PyTorch model with custom weight remapping."""
-        if self.model is not None:
-            return  # Already loaded
-
-        print(f"Loading {self.model_id} (PyTorch backend)...")
-
-        # Load processor
-        self.processor = PixtralProcessor.from_pretrained(self.model_id, trust_remote_code=True)
-
-        # Instantiate model with config
-        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
-        self.model = Mistral3ForConditionalGeneration(config)
-
-        # Download and remap weights
-        print("  Downloading and remapping weights...")
-        weights_path = hf_hub_download(repo_id=self.model_id, filename="model.safetensors")
-        state_dict = load_file(weights_path)
-
-        new_state_dict = {}
-        for k, v in state_dict.items():
-            new_key = k
-            if "vision_encoder" in k:
-                new_key = k.replace("vision_encoder", "vision_tower")
-            if "vision_projection" in k:
-                new_key = k.replace("vision_projection", "multi_modal_projector")
-            new_state_dict[new_key] = v
-
-        self.model.load_state_dict(new_state_dict, strict=False)
-
-        # Determine device
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        if platform.system() == "Darwin" and "arm" in platform.machine().lower():
-            self.device = "mps"
-
-        # MPS has issues with float16, use float32
-        if self.device == "mps":
-            self.dtype = torch.float32
-        else:
-            self.dtype = torch.float16 if self.device == "cuda" else torch.float32
-
-        self.model = self.model.to(device=self.device, dtype=self.dtype)
-        self.model.eval()
-
-        print(f"  Model loaded on {self.device} ({self.dtype})")
-
-    def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
-        """Process image using PyTorch model."""
-        if self.model is None:
-            self.load_model()
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": "Extract all text from this image. Be precise and include all visible text."}
-                ]
-            }
-        ]
-
-        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        # Ensure pixel_values match model dtype (critical for MPS)
-        if 'pixel_values' in inputs:
-            inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
-
-        # Configure generation parameters (aggressive anti-repetition for HF Space)
-        do_sample = temperature > 0.0
-        gen_kwargs = {
-            "max_new_tokens": max_tokens,
-            "pad_token_id": self.processor.tokenizer.eos_token_id,
-            "eos_token_id": self.processor.tokenizer.eos_token_id,
-            "repetition_penalty": 1.5,  # Increased from 1.2
-            "early_stopping": True,
-        }
-
-        if do_sample:
-            gen_kwargs["temperature"] = temperature
-            gen_kwargs["do_sample"] = True
-        else:
-            gen_kwargs["do_sample"] = False
-
-        with torch.no_grad():
-            generated_ids = self.model.generate(**inputs, **gen_kwargs)
-
-        # CRITICAL: Decode only NEW tokens (skip input prompt)
-        input_len = inputs['input_ids'].shape[1]
-        new_tokens = generated_ids[:, input_len:]
-        generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
-
-        # Post-processing: Clean any remaining artifacts
-        # Remove prompt instruction if it leaked through
-        instruction = "Extract all text from this image. Be precise and include all visible text."
-        if instruction in generated_text:
-            generated_text = generated_text.split(instruction)[-1].strip()
-
-        return generated_text
-
-    def get_backend_info(self) -> dict:
-        """Return backend information."""
-        return {
-            "name": "PyTorch",
-            "device": str(self.device) if self.device else "not loaded",
-            "dtype": str(self.dtype) if self.dtype else "not loaded",
-            "model_id": self.model_id,
-            "loaded": self.model is not None
-        }
hf_space/requirements.txt
DELETED

@@ -1,10 +0,0 @@
-gradio==5.42.0
-pillow>=10.3.0,<11
-pypdfium2==4.30.0
-# requests>=2.31.0,<3  # Already in base image usually, but good to keep
-huggingface_hub>=0.24.0
-torch>=2.0.0
-transformers>=4.36.0
-accelerate>=0.26.0
-safetensors>=0.4.0
-spaces==0.30.0
requirements.txt
CHANGED

@@ -9,3 +9,4 @@ accelerate>=0.26.0
 safetensors>=0.4.0
 # llama-cpp-python is optional for GGUF backend support (or use local build)
 # llama-cpp-python>=0.3.0
+spaces==0.30.0
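The root requirements now pull in the `spaces` package, which `app_space.py` needs to run on ZeroGPU hardware. A minimal sketch of the ZeroGPU pattern this commit targets (illustrative only, not the actual contents of `app_space.py`; `extract_text` is a stand-in for the real backend call):

```python
import gradio as gr
import spaces  # no-op decorator outside of ZeroGPU Spaces

# Placeholder for the real model call; app_space.py wires in the
# actual LightOnOCR PyTorch backend here.
def extract_text(image):
    return "recognized text"

@spaces.GPU  # on ZeroGPU, a GPU is attached only while this function runs
def run_ocr(image):
    return extract_text(image)

demo = gr.Interface(fn=run_ocr, inputs=gr.Image(type="pil"), outputs="text")

if __name__ == "__main__":
    demo.launch()
```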