chmielvu committed on
Commit
8dc0e64
·
verified ·
1 Parent(s): 03d98cf

Initial image-processing service

Browse files
Files changed (3) hide show
  1. README.md +17 -5
  2. app.py +330 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,24 @@
1
  ---
2
  title: Image Processing Service
3
- emoji: 🏃
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Image Processing Service
 
 
 
3
  sdk: gradio
4
+ sdk_version: 4.44.1
5
  app_file: app.py
6
  pinned: false
7
+ license: apache-2.0
8
  ---
9
 
10
+ # Image Processing Service (HF Space)
11
+
12
+ CPU-only Gradio Space intended to be called by `multi-llm-gateway` via `gradio_client`.
13
+
14
+ APIs (api_name):
15
+ - `/health`
16
+ - `/prepare_for_openai_vlm`
17
+ - `/prepare_for_openai_vlm_batch`
18
+ - `/embed_images_batch` (768d SigLIP vectors, L2 normalized)
19
+ - `/image_metrics`
20
+ - `/bg_remove` (rembg)
21
+ - `/trim_alpha`
22
+ - `/pack_spritesheet`
23
+
24
+ This Space does **not** write to Qdrant. The gateway owns persistence and routing.
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

import base64
import hashlib
import io
import json
import tempfile
from dataclasses import dataclass
from typing import Any

import gradio as gr
import numpy as np
from PIL import Image
13
+
14
+
15
+ # ---- Model (SigLIP 768d) ---------------------------------------------------
16
+
17
# Hugging Face checkpoint used for image embeddings (768-d per README).
SIGLIP_MODEL_ID = "google/siglip-base-patch16-224"


@dataclass(frozen=True)
class _Embedder:
    """Immutable bundle of the SigLIP processor and model, built lazily by _get_embedder()."""

    processor: Any  # transformers AutoProcessor loaded from SIGLIP_MODEL_ID
    model: Any  # transformers AutoModel loaded from SIGLIP_MODEL_ID


# Process-wide singleton; populated on the first _get_embedder() call.
_EMBEDDER: _Embedder | None = None
27
+
28
+
29
def _get_embedder() -> _Embedder:
    """Load (once) and return the SigLIP processor/model pair.

    torch/transformers are imported lazily so module import — and the Gradio
    UI — do not pay the model-download cost up front.
    """
    global _EMBEDDER
    if _EMBEDDER is not None:
        return _EMBEDDER

    import torch
    from transformers import AutoProcessor, AutoModel

    processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
    model = AutoModel.from_pretrained(SIGLIP_MODEL_ID)
    model.eval()
    # Inference-only process: disable autograd globally to save memory.
    torch.set_grad_enabled(False)
    _EMBEDDER = _Embedder(processor=processor, model=model)
    return _EMBEDDER
43
+
44
+
45
def _to_pil(x: Any) -> Image.Image:
    """Coerce a Gradio file value (PIL image, {"path": ...} dict, or path string) to a PIL image.

    Path-based inputs are opened and converted to RGBA; an already-PIL input
    is returned untouched. Raises TypeError for anything else.
    """
    if isinstance(x, Image.Image):
        return x

    path: str | None = None
    if isinstance(x, str):
        path = x
    elif isinstance(x, dict):
        candidate = x.get("path")
        if isinstance(candidate, str):
            path = candidate

    if path is not None:
        return Image.open(path).convert("RGBA")
    raise TypeError(f"Unsupported image input: {type(x).__name__}")
53
+
54
+
55
+ def _sha256_bytes(b: bytes) -> str:
56
+ return hashlib.sha256(b).hexdigest()
57
+
58
+
59
def _sha256_image(img: Image.Image) -> str:
    """Content hash of an image: encode to PNG in memory and hash the bytes."""
    encoded = io.BytesIO()
    img.save(encoded, format="PNG")
    return _sha256_bytes(encoded.getvalue())
63
+
64
+
65
+ def _l2_normalize(v: np.ndarray) -> np.ndarray:
66
+ n = np.linalg.norm(v, axis=-1, keepdims=True)
67
+ n = np.maximum(n, 1e-12)
68
+ return v / n
69
+
70
+
71
def _embed_pils(pils: list[Image.Image]) -> list[dict[str, Any]]:
    """Embed PIL images with SigLIP.

    Returns one dict per image: {dims, norm, model_id, sha256, vector}, with
    the vector L2-normalized float32 values as a plain list.
    """
    import torch

    emb = _get_embedder()
    inputs = emb.processor(images=[p.convert("RGB") for p in pils], return_tensors="pt")
    with torch.no_grad():
        # SigLIP-style models expose get_image_features on the multi-modal wrapper.
        if hasattr(emb.model, "get_image_features"):
            feats = emb.model.get_image_features(**inputs)
        else:
            model_out = emb.model(**inputs)
            # BUG FIX: `pooler_output or last_hidden_state[...]` raised
            # "Boolean value of Tensor with more than one element is ambiguous";
            # a tensor must be compared against None explicitly.
            pooled = getattr(model_out, "pooler_output", None)
            feats = pooled if pooled is not None else model_out.last_hidden_state[:, 0, :]
    feats = _l2_normalize(feats.detach().cpu().numpy().astype("float32"))

    results: list[dict[str, Any]] = []  # renamed from `out`, which shadowed the model output
    for p, vec in zip(pils, feats):
        results.append(
            {
                "dims": int(vec.shape[0]),
                "norm": "l2",
                "model_id": SIGLIP_MODEL_ID,
                "sha256": _sha256_image(p),
                "vector": vec.tolist(),
            }
        )
    return results
97
+
98
+
99
+ # ---- Metrics / Heuristics ---------------------------------------------------
100
+
101
def _dhash(img: Image.Image, size: int = 8) -> str:
    """Difference hash: downscale to grayscale and compare horizontally adjacent pixels.

    Returns a zero-padded hex string of size*size bits (16 hex chars by default).
    """
    gray = img.convert("L").resize((size + 1, size), Image.BILINEAR)
    px = np.asarray(gray, dtype=np.int16)
    mask = px[:, 1:] > px[:, :-1]
    value = 0
    for bit in mask.flatten().tolist():
        value = (value << 1) | int(bit)
    return format(value, "0{}x".format(size * size // 4))
107
+
108
+
109
def _laplacian_var(img: Image.Image) -> float:
    """Variance of the 4-neighbour Laplacian response; larger means sharper.

    Applies the kernel [[0,1,0],[1,-4,1],[0,1,0]] over the "valid" interior via
    shifted slices; images smaller than 3x3 score 0.0.
    """
    pix = np.asarray(img.convert("L"), dtype=np.float32)
    rows, cols = pix.shape
    if rows < 3 or cols < 3:
        return 0.0
    # Terms kept in the original order (left, top, center, bottom, right)
    # so float32 rounding is bit-identical.
    response = (
        pix[1:-1, :-2]
        + pix[:-2, 1:-1]
        + pix[1:-1, 1:-1] * -4.0
        + pix[2:, 1:-1]
        + pix[1:-1, 2:]
    )
    return float(np.var(response))
125
+
126
+
127
def image_metrics(image: Any) -> str:
    """Return a JSON string of cheap quality/identity metrics for one image.

    Metrics: dimensions, blur (Laplacian variance), contrast std, mean
    brightness (RGB in [0,1]), 64-bit dHash, alpha presence/coverage, and a
    PNG-content sha256.
    """
    img = _to_pil(image)
    arr = np.asarray(img.convert("RGB"), dtype=np.float32) / 255.0
    # NOTE(review): only RGBA/LA are treated as alpha-bearing; palette images
    # with transparency would be reported as opaque.
    has_alpha = img.mode in ("RGBA", "LA")
    alpha_cov = 1.0
    if has_alpha:
        # Fraction of pixels that are not (almost) fully transparent.
        a = np.asarray(img.split()[-1], dtype=np.float32) / 255.0
        alpha_cov = float(np.mean(a > 0.05))
    metrics = {
        "width": img.width,
        "height": img.height,
        "blur_laplacian_var": _laplacian_var(img),
        "contrast_std": float(np.std(arr)),
        "mean_brightness": float(np.mean(arr)),
        "dhash": _dhash(img),
        "has_alpha": bool(has_alpha),
        "alpha_coverage": alpha_cov,
        "sha256": _sha256_image(img),
    }
    return json.dumps(metrics)
147
+
148
+
149
+ # ---- VLM prep (OpenAI image_url data URL) ----------------------------------
150
+
151
def _resize_max_side(img: Image.Image, max_side: int) -> Image.Image:
    """Downscale so the longer edge is at most max_side (LANCZOS).

    Never upscales; a non-positive max_side disables resizing entirely.
    """
    limit = int(max_side)
    if limit <= 0:
        return img

    w, h = img.size
    longest = w if w >= h else h
    if longest <= limit:
        return img

    ratio = limit / float(longest)
    new_w = max(1, int(round(w * ratio)))
    new_h = max(1, int(round(h * ratio)))
    return img.resize((new_w, new_h), Image.LANCZOS)
163
+
164
+
165
def prepare_for_openai_vlm(image: Any, max_side: int = 768, fmt: str = "webp", quality: int = 85) -> str:
    """Resize and re-encode an image as a base64 data URL for OpenAI-style image_url payloads.

    fmt may be "webp" (default), "jpeg"/"jpg", or "png"; anything else falls
    back to webp. Returns a JSON string: {url, mime, width, height, sha256}.
    """
    img = _resize_max_side(_to_pil(image), max_side=max_side)
    chosen = (fmt or "webp").lower()
    q = int(quality)

    buf = io.BytesIO()
    if chosen in ("jpeg", "jpg"):
        mime = "image/jpeg"
        img.convert("RGB").save(buf, format="JPEG", quality=q, optimize=True)
    elif chosen == "png":
        mime = "image/png"
        img.save(buf, format="PNG", optimize=True)
    else:
        mime = "image/webp"
        img.convert("RGB").save(buf, format="WEBP", quality=q, method=6)

    payload = buf.getvalue()
    data_url = f"data:{mime};base64," + base64.b64encode(payload).decode("ascii")
    return json.dumps(
        {
            "url": data_url,
            "mime": mime,
            "width": img.width,
            "height": img.height,
            "sha256": _sha256_bytes(payload),
        }
    )
193
+
194
+
195
def prepare_for_openai_vlm_batch(images: list[Any], max_side: int = 768, fmt: str = "webp", quality: int = 85) -> str:
    """Batch wrapper around prepare_for_openai_vlm.

    Returns a JSON array whose items are the parsed per-image payloads;
    None/empty input yields "[]".
    """
    prepared = [
        json.loads(prepare_for_openai_vlm(item, max_side=max_side, fmt=fmt, quality=quality))
        for item in (images or [])
    ]
    return json.dumps(prepared)
200
+
201
+
202
+ # ---- Background removal + alpha trim ----------------------------------------
203
+
204
def bg_remove(image: Any) -> tuple[str, str]:
    """Remove the background of an image with rembg.

    Returns (path to an RGBA PNG with background removed, JSON metadata with
    method plus input/output sha256 hashes).
    """
    from rembg import remove

    img = _to_pil(image).convert("RGBA")
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    out_bytes = remove(buf.getvalue())

    # BUG FIX: a fixed filename ("bg_removed.png") is racy when the queue runs
    # requests concurrently; write to a unique temp file Gradio can still serve.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        f.write(out_bytes)
        out_path = f.name

    meta = {"method": "rembg", "sha256_in": _sha256_image(img), "sha256_out": _sha256_bytes(out_bytes)}
    return out_path, json.dumps(meta)
219
+
220
+
221
def trim_alpha(image: Any) -> tuple[str, str]:
    """Crop an image to the bounding box of its non-transparent pixels.

    Returns (path to the cropped PNG, JSON metadata with bbox [x, y, w, h] and
    orig_size). A fully transparent image is saved unchanged with a full-image
    bbox.
    """
    img = _to_pil(image).convert("RGBA")
    alpha = np.asarray(img.split()[-1], dtype=np.uint8)
    ys, xs = np.where(alpha > 0)

    if len(xs) == 0 or len(ys) == 0:
        # Nothing opaque: keep the image as-is and report the full bbox.
        result = img
        bbox = [0, 0, img.width, img.height]
    else:
        x0, x1 = int(xs.min()), int(xs.max())
        y0, y1 = int(ys.min()), int(ys.max())
        w = x1 - x0 + 1  # inclusive pixel coordinates -> size
        h = y1 - y0 + 1
        result = img.crop((x0, y0, x0 + w, y0 + h))
        bbox = [x0, y0, w, h]

    # BUG FIX: a fixed filename ("trimmed.png") is racy under concurrent
    # requests; write to a unique temp file instead.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        result.save(f, format="PNG")
        out_path = f.name

    meta = {"bbox": bbox, "orig_size": [img.width, img.height]}
    return out_path, json.dumps(meta)
242
+
243
+
244
+ # ---- Spritesheet packing ----------------------------------------------------
245
+
246
def pack_spritesheet(images: list[Any], names_json: str) -> tuple[str, str]:
    """Pack images into a fixed-column grid spritesheet.

    names_json: optional JSON array of labels matched positionally to images;
    invalid JSON or non-list input falls back to "item_<i>" keys.
    Returns (path to the sheet PNG, JSON mapping with cell size and per-item
    x/y/w/h), or ("", {"error": "no_images"}) for empty input.
    """
    try:
        names = json.loads(names_json or "[]")
    except Exception:
        names = []
    if not isinstance(names, list):
        names = []

    pils = [_to_pil(x).convert("RGBA") for x in (images or [])]
    if not pils:
        return "", json.dumps({"error": "no_images"})

    # Simple grid packer: up to 4 columns, uniform cells sized to the largest image.
    cols = min(4, len(pils))
    rows = int(np.ceil(len(pils) / cols))
    cell_w = max(p.width for p in pils)
    cell_h = max(p.height for p in pils)
    sheet = Image.new("RGBA", (cell_w * cols, cell_h * rows), (0, 0, 0, 0))

    mapping: dict[str, Any] = {"cell": [cell_w, cell_h], "items": {}}
    for i, p in enumerate(pils):
        x = (i % cols) * cell_w
        y = (i // cols) * cell_h
        sheet.alpha_composite(p, (x, y))
        key = str(names[i]) if i < len(names) else f"item_{i}"
        mapping["items"][key] = {"x": x, "y": y, "w": p.width, "h": p.height}

    # BUG FIX: a fixed filename ("spritesheet.png") is racy under concurrent
    # requests; write to a unique temp file instead.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        sheet.save(f, format="PNG")
        out_path = f.name
    return out_path, json.dumps(mapping)
279
+
280
+
281
+ # ---- Public endpoints -------------------------------------------------------
282
+
283
def health() -> str:
    """Liveness probe: static JSON with embedding dims and model id (does not load the model)."""
    return json.dumps({"ok": True, "embed_dims": 768, "model_id": SIGLIP_MODEL_ID})
285
+
286
+
287
def embed_images_batch(images: list[Any]) -> str:
    """Embed a batch of images with SigLIP; returns a JSON array of embedding records."""
    batch = [_to_pil(item) for item in (images or [])]
    return json.dumps(_embed_pils(batch))
291
+
292
+
293
# Minimal Gradio UI whose real purpose is to register the named API routes
# (api_name=...) that gradio_client callers use; the widgets double as a
# manual test harness.
with gr.Blocks() as demo:
    gr.Markdown("# Image Processing Service")

    with gr.Tab("API"):
        # Single-image input plus the encode knobs consumed by the VLM-prep endpoint.
        inp = gr.File(label="Image", file_types=["image"])
        max_side = gr.Slider(128, 2048, value=768, step=64, label="max_side (VLM prep)")
        fmt = gr.Dropdown(["webp", "jpeg", "png"], value="webp", label="format")
        quality = gr.Slider(10, 100, value=85, step=1, label="quality")

        out_json = gr.Code(language="json", label="Output JSON")
        out_file = gr.File(label="Output File")

        # Single-image endpoints.
        gr.Button("Health").click(health, outputs=out_json, api_name="/health")
        gr.Button("Prepare for OpenAI VLM").click(
            prepare_for_openai_vlm, inputs=[inp, max_side, fmt, quality], outputs=out_json, api_name="/prepare_for_openai_vlm"
        )
        gr.Button("Metrics").click(image_metrics, inputs=inp, outputs=out_json, api_name="/image_metrics")
        gr.Button("BG Remove").click(bg_remove, inputs=inp, outputs=[out_file, out_json], api_name="/bg_remove")
        gr.Button("Trim Alpha").click(trim_alpha, inputs=inp, outputs=[out_file, out_json], api_name="/trim_alpha")

        # Batch endpoints (API-only; UI is minimal)
        batch_inp = gr.Files(label="Images (batch)", file_types=["image"])
        batch_out = gr.Code(language="json", label="Batch JSON")
        gr.Button("Prepare VLM Batch").click(
            prepare_for_openai_vlm_batch, inputs=[batch_inp, max_side, fmt, quality], outputs=batch_out, api_name="/prepare_for_openai_vlm_batch"
        )
        gr.Button("Embed Batch").click(embed_images_batch, inputs=batch_inp, outputs=batch_out, api_name="/embed_images_batch")

        # Spritesheet pack
        names = gr.Textbox(label="Names JSON", value='["neutral","happy"]')
        sheet_file = gr.File(label="Spritesheet PNG")
        sheet_map = gr.Code(language="json", label="Spritesheet Map")
        gr.Button("Pack Spritesheet").click(pack_spritesheet, inputs=[batch_inp, names], outputs=[sheet_file, sheet_map], api_name="/pack_spritesheet")


if __name__ == "__main__":
    # CPU-only Space: keep concurrency low and bound the pending-request queue.
    demo.queue(default_concurrency_limit=2, max_size=64).launch()
330
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ numpy>=2.0.0
3
+ pillow>=10.0.0
4
+ torch>=2.2.0
5
+ transformers>=4.45.0
6
+ rembg>=2.0.60
7
+ onnxruntime>=1.17.0
8
+ httpx>=0.27.0