DocUA committed
Commit a25a813 · 1 Parent(s): ff2c62c

Unified project structure: app_space.py for ZeroGPU, root README metadata

README.md CHANGED
@@ -1,6 +1,18 @@
+---
+title: LightOnOCR 1B Demo
+emoji: 📖
+colorFrom: blue
+colorTo: indigo
+sdk: gradio
+sdk_version: 5.42.0
+app_file: app_space.py
+pinned: false
+license: other
+---
+
 # LightOnOCR-1B Demo
 
-High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon.
+High-performance OCR application using LightOnOCR-1B model, optimized for Apple Silicon and ZeroGPU.
 
 ## 🚀 Performance
 - **GGUF Backend:** ~3-4 seconds per page (M3 Max)!
@@ -8,12 +20,12 @@ High-performance OCR application using LightOnOCR-1B model, optimized for Apple
 
 ## Features
 - 📄 PDF and image support
-- 🔄 Seamless switching between GGUF and PyTorch backends
+- 🔄 Seamless switching between GGUF and PyTorch backends (Local)
 - 🎛️ Configurable resolution (scale) and token generation
 - 🖥️ CLI and Gradio web interface
 - 🍎 Full Metal/MPS support
 
-## Quick Start
+## Quick Start (Local)
 
 ### 1. Prerequisites
 - Python 3.10+
hf_space/app.py → app_space.py RENAMED
File without changes
hf_space/README.md DELETED
@@ -1,27 +0,0 @@
----
-title: LightOnOCR 1B Demo
-emoji: 📖
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.42.0
-app_file: app.py
-pinned: false
-license: other
----
-
-# 📖 LightOnOCR-1B Demo
-
-A high-performance OCR demo using the **LightOnOCR-1B** model.
-This demo uses the PyTorch backend optimized for accuracy.
-
-## Features
-- **PDF & Image Input:** Upload multi-page PDFs or single images.
-- **Configurable Generation:** Adjust temperature and max tokens.
-- **ZeroGPU Support:** Runs efficiently on Hugging Face ZeroGPU infrastructure.
-
-## Model
-Uses [lightonai/LightOnOCR-1B-1025](https://huggingface.co/lightonai/LightOnOCR-1B-1025).
-
-## Local Development
-To run this locally with maximum performance (including GGUF support for Apple Silicon), see the GitHub repository.
hf_space/backends/__init__.py DELETED
@@ -1,78 +0,0 @@
-"""
-Backend interface for LightOnOCR-1B inference.
-Supports both PyTorch and GGUF backends.
-"""
-
-from abc import ABC, abstractmethod
-from typing import List, Tuple
-from PIL import Image
-
-
-class OCRBackend(ABC):
-    """Abstract base class for OCR backends."""
-
-    @abstractmethod
-    def load_model(self):
-        """Load the OCR model."""
-        pass
-
-    @abstractmethod
-    def process_image(self, image: Image.Image, temperature: float = 0.1) -> str:
-        """
-        Process a single image and return extracted text.
-
-        Args:
-            image: PIL Image to process
-            temperature: Sampling temperature (0 = greedy)
-
-        Returns:
-            Extracted text as string
-        """
-        pass
-
-    @abstractmethod
-    def get_backend_info(self) -> dict:
-        """Return backend information (name, device, memory usage, etc.)."""
-        pass
-
-
-def get_available_backends() -> List[str]:
-    """Return list of available backend names."""
-    backends = ["pytorch"]
-
-    # Check for GGUF support (binary or python package)
-    from pathlib import Path
-    project_root = Path(__file__).parent.parent
-    cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
-
-    if cli_path.exists():
-        backends.append("gguf")
-    else:
-        # Fallback check for python package (though we prefer CLI now)
-        try:
-            import llama_cpp
-            backends.append("gguf")
-        except ImportError:
-            pass
-
-    return backends
-
-
-def create_backend(backend_name: str) -> OCRBackend:
-    """
-    Factory function to create backend instance.
-
-    Args:
-        backend_name: "pytorch" or "gguf"
-
-    Returns:
-        OCRBackend instance
-    """
-    if backend_name == "pytorch":
-        from .pytorch_backend import PyTorchBackend
-        return PyTorchBackend()
-    elif backend_name == "gguf":
-        from .gguf_backend import GGUFBackend
-        return GGUFBackend()
-    else:
-        raise ValueError(f"Unknown backend: {backend_name}. Available: {get_available_backends()}")
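For reference, the deleted module above defined the backend selection API for the rest of the project. A minimal usage sketch, not part of this commit: it assumes the package is importable as `backends` and that a local `sample.png` exists.

```python
# Illustrative sketch only: exercising the deleted backend factory.
from PIL import Image

from backends import create_backend, get_available_backends

names = get_available_backends()      # e.g. ["pytorch"] or ["pytorch", "gguf"]
backend = create_backend(names[-1])   # "gguf" is appended last when the llama.cpp CLI is found
backend.load_model()

text = backend.process_image(Image.open("sample.png"), temperature=0.0)
print(backend.get_backend_info())
print(text)
```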
hf_space/backends/gguf_backend.py DELETED
@@ -1,138 +0,0 @@
-"""
-GGUF backend for LightOnOCR-1B using local llama-mtmd-cli binary.
-"""
-
-import os
-import io
-import tempfile
-import subprocess
-from pathlib import Path
-from PIL import Image
-from typing import Optional
-
-from . import OCRBackend
-
-
-class GGUFBackend(OCRBackend):
-    """GGUF-based OCR backend using local llama-mtmd-cli binary."""
-
-    def __init__(self, model_path: Optional[str] = None, mmproj_path: Optional[str] = None):
-        """
-        Initialize GGUF backend.
-
-        Args:
-            model_path: Path to GGUF model file
-            mmproj_path: Path to mmproj file
-        """
-        self.model_path = model_path
-        self.mmproj_path = mmproj_path
-        self.cli_path = self._find_cli_binary()
-        self._auto_detect_files()
-
-    def _find_cli_binary(self) -> Optional[str]:
-        """Find the llama-mtmd-cli binary."""
-        # Check project root llama.cpp build
-        project_root = Path(__file__).parent.parent
-        cli_path = project_root / "llama.cpp" / "build" / "bin" / "llama-mtmd-cli"
-        if cli_path.exists():
-            return str(cli_path)
-        return None
-
-    def _auto_detect_files(self):
-        """Try to find GGUF model and mmproj files."""
-        if self.model_path and Path(self.model_path).exists():
-            if not self.mmproj_path:
-                model_dir = Path(self.model_path).parent
-                for mmproj_file in model_dir.glob("*mmproj*.gguf"):
-                    self.mmproj_path = str(mmproj_file)
-                    print(f"Auto-detected mmproj: {self.mmproj_path}")
-                    break
-            return
-
-        search_paths = [
-            Path.cwd() / "models",
-            Path.cwd() / "gguf_models",
-        ]
-
-        for search_path in search_paths:
-            if not search_path.exists():
-                continue
-            for gguf_file in search_path.rglob("*.gguf"):
-                if "lightonocr" in gguf_file.name.lower() and "mmproj" not in gguf_file.name.lower():
-                    self.model_path = str(gguf_file)
-                    print(f"Auto-detected model: {self.model_path}")
-                    model_dir = gguf_file.parent
-                    for mmproj_file in model_dir.glob("*mmproj*.gguf"):
-                        self.mmproj_path = str(mmproj_file)
-                        print(f"Auto-detected mmproj: {self.mmproj_path}")
-                        break
-                    break
-            if self.model_path:
-                break
-
-    def load_model(self):
-        """Verify model, mmproj and CLI binary exist."""
-        if not self.cli_path:
-            raise RuntimeError(
-                "llama-mtmd-cli binary not found.\n"
-                "Please build llama.cpp locally:\n"
-                "  git clone https://github.com/ggerganov/llama.cpp\n"
-                "  cd llama.cpp && mkdir build && cd build\n"
-                "  cmake .. -DGGML_METAL=ON && cmake --build . --config Release"
-            )
-
-        if not self.model_path or not Path(self.model_path).exists():
-            raise ValueError("GGUF model not found. Run download_gguf_model.py")
-
-        if not self.mmproj_path or not Path(self.mmproj_path).exists():
-            raise ValueError("mmproj file not found. Run download_gguf_model.py")
-
-        print(f"GGUF Backend ready:")
-        print(f"  CLI: {self.cli_path}")
-        print(f"  Model: {self.model_path}")
-        print(f"  Projector: {self.mmproj_path}")
-
-    def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
-        """Process image using llama-mtmd-cli."""
-        if not self.cli_path:
-            self.load_model()
-
-        # Save image to temp file
-        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
-            image.save(tmp_img.name)
-            tmp_img_path = tmp_img.name
-
-        try:
-            cmd = [
-                self.cli_path,
-                "-m", self.model_path,
-                "--mmproj", self.mmproj_path,
-                "--image", tmp_img_path,
-                "-p", "Extract all text from this image. Be precise and include all visible text.",
-                "--temp", str(temperature),
-                "--n-predict", str(max_tokens),
-                # "--log-disable"  # Removed as it suppresses output
-            ]
-
-            # Run CLI
-            result = subprocess.run(cmd, capture_output=True, text=True)
-
-            if result.returncode != 0:
-                print(f"CLI Error: {result.stderr}")
-                raise RuntimeError(f"llama-mtmd-cli failed: {result.stderr}")
-
-            # stdout contains the generated text, stderr contains logs
-            return result.stdout.strip()
-
-        finally:
-            if os.path.exists(tmp_img_path):
-                os.unlink(tmp_img_path)
-
-    def get_backend_info(self) -> dict:
-        return {
-            "name": "GGUF (llama-mtmd-cli)",
-            "device": "Metal (via CLI)",
-            "model_path": self.model_path or "not found",
-            "mmproj_path": self.mmproj_path or "not found",
-            "cli_path": self.cli_path
-        }
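The README advertises PDF input and requirements.txt pins pypdfium2, so a plausible way to push a PDF page through this backend is sketched below. Not part of this commit: `document.pdf` and the importability of the deleted `backends` package are assumptions.

```python
# Illustrative sketch only: render a PDF page with pypdfium2 and OCR it via the GGUF backend.
import pypdfium2 as pdfium

from backends.gguf_backend import GGUFBackend

pdf = pdfium.PdfDocument("document.pdf")
page_image = pdf[0].render(scale=2.0).to_pil()   # higher scale => higher OCR resolution

backend = GGUFBackend()   # auto-detects the GGUF model, mmproj and llama-mtmd-cli binary
backend.load_model()
print(backend.process_image(page_image, temperature=0.0, max_tokens=2048))
```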
hf_space/backends/pytorch_backend.py DELETED
@@ -1,136 +0,0 @@
-"""
-PyTorch backend for LightOnOCR-1B.
-Uses Mistral3ForConditionalGeneration with custom weight remapping.
-"""
-
-import torch
-import platform
-from pathlib import Path
-from PIL import Image
-from transformers import AutoConfig, PixtralProcessor, Mistral3ForConditionalGeneration
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
-
-from . import OCRBackend
-
-
-class PyTorchBackend(OCRBackend):
-    """PyTorch-based OCR backend using transformers."""
-
-    def __init__(self):
-        self.model = None
-        self.processor = None
-        self.device = None
-        self.dtype = None
-        self.model_id = "lightonai/LightOnOCR-1B-1025"
-
-    def load_model(self):
-        """Load the PyTorch model with custom weight remapping."""
-        if self.model is not None:
-            return  # Already loaded
-
-        print(f"Loading {self.model_id} (PyTorch backend)...")
-
-        # Load processor
-        self.processor = PixtralProcessor.from_pretrained(self.model_id, trust_remote_code=True)
-
-        # Instantiate model with config
-        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
-        self.model = Mistral3ForConditionalGeneration(config)
-
-        # Download and remap weights
-        print("  Downloading and remapping weights...")
-        weights_path = hf_hub_download(repo_id=self.model_id, filename="model.safetensors")
-        state_dict = load_file(weights_path)
-
-        new_state_dict = {}
-        for k, v in state_dict.items():
-            new_key = k
-            if "vision_encoder" in k:
-                new_key = k.replace("vision_encoder", "vision_tower")
-            if "vision_projection" in k:
-                new_key = k.replace("vision_projection", "multi_modal_projector")
-            new_state_dict[new_key] = v
-
-        self.model.load_state_dict(new_state_dict, strict=False)
-
-        # Determine device
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        if platform.system() == "Darwin" and "arm" in platform.machine().lower():
-            self.device = "mps"
-
-        # MPS has issues with float16, use float32
-        if self.device == "mps":
-            self.dtype = torch.float32
-        else:
-            self.dtype = torch.float16 if self.device == "cuda" else torch.float32
-
-        self.model = self.model.to(device=self.device, dtype=self.dtype)
-        self.model.eval()
-
-        print(f"  Model loaded on {self.device} ({self.dtype})")
-
-    def process_image(self, image: Image.Image, temperature: float = 0.1, max_tokens: int = 1024) -> str:
-        """Process image using PyTorch model."""
-        if self.model is None:
-            self.load_model()
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": "Extract all text from this image. Be precise and include all visible text."}
-                ]
-            }
-        ]
-
-        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        # Ensure pixel_values match model dtype (critical for MPS)
-        if 'pixel_values' in inputs:
-            inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
-
-        # Configure generation parameters (aggressive anti-repetition for HF Space)
-        do_sample = temperature > 0.0
-        gen_kwargs = {
-            "max_new_tokens": max_tokens,
-            "pad_token_id": self.processor.tokenizer.eos_token_id,
-            "eos_token_id": self.processor.tokenizer.eos_token_id,
-            "repetition_penalty": 1.5,  # Increased from 1.2
-            "early_stopping": True,
-        }
-
-        if do_sample:
-            gen_kwargs["temperature"] = temperature
-            gen_kwargs["do_sample"] = True
-        else:
-            gen_kwargs["do_sample"] = False
-
-        with torch.no_grad():
-            generated_ids = self.model.generate(**inputs, **gen_kwargs)
-
-        # CRITICAL: Decode only NEW tokens (skip input prompt)
-        input_len = inputs['input_ids'].shape[1]
-        new_tokens = generated_ids[:, input_len:]
-        generated_text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
-
-        # Post-processing: Clean any remaining artifacts
-        # Remove prompt instruction if it leaked through
-        instruction = "Extract all text from this image. Be precise and include all visible text."
-        if instruction in generated_text:
-            generated_text = generated_text.split(instruction)[-1].strip()
-
-        return generated_text
-
-    def get_backend_info(self) -> dict:
-        """Return backend information."""
-        return {
-            "name": "PyTorch",
-            "device": str(self.device) if self.device else "not loaded",
-            "dtype": str(self.dtype) if self.dtype else "not loaded",
-            "model_id": self.model_id,
-            "loaded": self.model is not None
-        }
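The remapping loop in `load_model()` is the non-obvious part of this backend: the released checkpoint uses `vision_encoder`/`vision_projection` key prefixes, while `Mistral3ForConditionalGeneration` expects `vision_tower`/`multi_modal_projector`. The sketch below only previews those renames without loading any tensors; it is not part of this commit and assumes network access to the Hub.

```python
# Illustrative sketch only: list which checkpoint keys the backend above would rename.
from huggingface_hub import hf_hub_download
from safetensors import safe_open

weights_path = hf_hub_download(repo_id="lightonai/LightOnOCR-1B-1025", filename="model.safetensors")

with safe_open(weights_path, framework="pt") as f:
    for key in f.keys():
        if "vision_encoder" in key or "vision_projection" in key:
            remapped = (key.replace("vision_encoder", "vision_tower")
                           .replace("vision_projection", "multi_modal_projector"))
            print(f"{key} -> {remapped}")
```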
hf_space/requirements.txt DELETED
@@ -1,10 +0,0 @@
-gradio==5.42.0
-pillow>=10.3.0,<11
-pypdfium2==4.30.0
-# requests>=2.31.0,<3  # Already in base image usually, but good to keep
-huggingface_hub>=0.24.0
-torch>=2.0.0
-transformers>=4.36.0
-accelerate>=0.26.0
-safetensors>=0.4.0
-spaces==0.30.0
requirements.txt CHANGED
@@ -9,3 +9,4 @@ accelerate>=0.26.0
 safetensors>=0.4.0
 # llama-cpp-python is optional for GGUF backend support (or use local build)
 # llama-cpp-python>=0.3.0
+spaces==0.30.0
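The new `spaces` pin is what backs the "app_space.py for ZeroGPU" part of the commit message. The contents of app_space.py are not shown in this diff; a typical ZeroGPU-decorated Gradio entry point looks roughly like this hypothetical sketch (function name and body are placeholders, not the actual app).

```python
# Hypothetical sketch only: how the `spaces` package pinned above is usually wired
# into a Gradio app so a ZeroGPU slice is requested only while inference runs.
import gradio as gr
import spaces


@spaces.GPU  # allocate a ZeroGPU device for the duration of this call
def ocr_image(image):
    # Placeholder: the real app would run the PyTorch backend on `image` here.
    return "extracted text"


demo = gr.Interface(fn=ocr_image, inputs=gr.Image(type="pil"), outputs="text")

if __name__ == "__main__":
    demo.launch()
```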