reygml committed
Commit dc9ded5 · 1 Parent(s): 43e6aa2

add monitoring

Files changed (2)
  1. app.py +37 -24
  2. util.py +153 -38
app.py CHANGED
@@ -1,5 +1,5 @@
 # app.py
-import asyncio
+from time import perf_counter
 from typing import List, Optional
 
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
@@ -8,8 +8,7 @@ import uvicorn
 
 from util import get_runner, SmolVLMRunner
 
-
-app = FastAPI(title="SmolVLM Inference API", version="1.0.0")
+app = FastAPI(title="SmolVLM Inference API", version="1.1.0")
 _runner: Optional[SmolVLMRunner] = None
 
 
@@ -40,56 +39,70 @@ async def generate_from_files(
     temperature: Optional[float] = Form(None),
     top_p: Optional[float] = Form(None),
 ):
-    """
-    Multipart form endpoint:
-      - prompt: str
-      - images: one or more image files (image/*)
-    """
     if not images:
         raise HTTPException(status_code=400, detail="At least one image must be provided.")
 
-    # Read all files into memory (simple & fine for moderate sizes)
+    t_req_start = perf_counter()
+
+    # Read files
+    t_load_start = perf_counter()
     blobs = []
     for f in images:
         if not f.content_type or not f.content_type.startswith("image/"):
            raise HTTPException(status_code=415, detail=f"Unsupported file type: {f.content_type}")
         blobs.append(await f.read())
-
     pil_images = _runner.load_pil_from_bytes(blobs)
-    text = _runner.generate(
+    t_load_end = perf_counter()
+
+    text, inner_metrics = _runner.generate(
         prompt=prompt,
         images=pil_images,
         max_new_tokens=max_new_tokens,
         temperature=temperature,
         top_p=top_p,
+        return_stats=True,
     )
-    return {"text": text}
+
+    t_req_end = perf_counter()
+    metrics = {
+        **inner_metrics,
+        "request_ms": {
+            "image_load": round((t_load_end - t_load_start) * 1000.0, 2),
+            "end_to_end": round((t_req_end - t_req_start) * 1000.0, 2),
+        },
+    }
+    return {"text": text, "metrics": metrics}
 
 
 @app.post("/generate_urls")
 async def generate_from_urls(req: URLRequest):
-    """
-    JSON endpoint:
-    {
-      "prompt": "...",
-      "image_urls": ["https://...","https://..."],
-      "max_new_tokens": 300,
-      "temperature": 0.2,
-      "top_p": 0.95
-    }
-    """
+    t_req_start = perf_counter()
+
     if len(req.image_urls) == 0:
         raise HTTPException(status_code=400, detail="At least one image URL is required.")
 
+    t_load_start = perf_counter()
     pil_images = _runner.load_pil_from_urls([str(u) for u in req.image_urls])
-    text = _runner.generate(
+    t_load_end = perf_counter()
+
+    text, inner_metrics = _runner.generate(
        prompt=req.prompt,
        images=pil_images,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        top_p=req.top_p,
+        return_stats=True,
    )
-    return {"text": text}
+
+    t_req_end = perf_counter()
+    metrics = {
+        **inner_metrics,
+        "request_ms": {
+            "image_load": round((t_load_end - t_load_start) * 1000.0, 2),
+            "end_to_end": round((t_req_end - t_req_start) * 1000.0, 2),
+        },
+    }
+    return {"text": text, "metrics": metrics}
 
 
 if __name__ == "__main__":
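
For context, a minimal client sketch against the /generate_urls endpoint shown above (host, port, prompt, and image URL are illustrative assumptions, not part of this commit); it reads the new "metrics" field returned alongside "text":

# client_example.py -- illustrative only; host/port and image URL are assumptions
import requests

payload = {
    "prompt": "Describe the image.",
    "image_urls": ["https://example.com/cat.jpg"],  # placeholder URL
    "max_new_tokens": 128,
    "temperature": 0.2,
    "top_p": 0.95,
}
resp = requests.post("http://localhost:8000/generate_urls", json=payload, timeout=120)
resp.raise_for_status()
body = resp.json()
print(body["text"])
# Per-request timers added in app.py, merged with the per-call metrics from util.py
print(body["metrics"]["request_ms"])   # {"image_load": ..., "end_to_end": ...}
print(body["metrics"]["timings_ms"])   # {"preprocess": ..., "inference": ..., "decode": ..., "total": ...}

The "request_ms" block comes from the request-level timers added in app.py; the remaining keys are merged in from util.py's per-call metrics.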
util.py CHANGED
@@ -1,57 +1,85 @@
-
-# util.py (patched cache handling for HF Spaces)
+# util.py (Spaces-safe + metrics)
 import os
 from pathlib import Path
+from time import perf_counter
+import threading
+from io import BytesIO
+from typing import List, Sequence, Tuple, Dict, Any
+
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image as hf_load_image
 
-# Put every cache under /tmp (always writable in Spaces)
+# ---- Writable caches (HF Spaces safe) ----
 CACHE_DIR = os.getenv("HF_CACHE_DIR", "/tmp/hf-cache")
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
-
-# Make sure libraries don't fall back to "~/.cache" -> "/.cache"
 os.environ.setdefault("HF_HOME", CACHE_DIR)
 os.environ.setdefault("TRANSFORMERS_CACHE", CACHE_DIR)
 os.environ.setdefault("HUGGINGFACE_HUB_CACHE", CACHE_DIR)
 os.environ.setdefault("XDG_CACHE_HOME", CACHE_DIR)
 os.environ.setdefault("TORCH_HOME", CACHE_DIR)
 
-import threading
-from io import BytesIO
-from typing import List, Sequence
-import torch
-from PIL import Image
-from transformers import AutoProcessor, AutoModelForVision2Seq
-from transformers.image_utils import load_image as hf_load_image
+
+def _has_flash_attn() -> bool:
+    try:
+        import flash_attn  # noqa: F401
+        return True
+    except Exception:
+        return False
+
+
+def _pick_backend_and_dtype():
+    if not torch.cuda.is_available():
+        return "eager", torch.float32, "cpu"
+
+    major, _ = torch.cuda.get_device_capability()
+    dev = "cuda"
+    bf16_ok = torch.cuda.is_bf16_supported()
+    dtype = torch.bfloat16 if bf16_ok else torch.float16
+    if major >= 8:  # Ampere+
+        attn = "flash_attention_2" if _has_flash_attn() else "sdpa"
+    else:
+        attn = "sdpa"
+    return attn, dtype, dev
 
 
 class SmolVLMRunner:
+    """Portable wrapper with per-call metrics."""
+
     def __init__(self, model_id: str | None = None, device: str | None = None):
         self.model_id = model_id or os.getenv("SMOLVLM_MODEL_ID", "HuggingFaceTB/SmolVLM-Instruct")
-        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
 
-        # Use the writable cache dir explicitly
+        attn_impl, dtype, dev = _pick_backend_and_dtype()
+        attn_impl = os.getenv("SMOLVLM_ATTN", attn_impl)  # optional override
+        self.device = device or dev
+        self.dtype = dtype
+        self.attn_impl = attn_impl
+
+        if self.device == "cuda" and self.attn_impl == "sdpa":
+            try:
+                from torch.backends.cuda import sdp_kernel
+                sdp_kernel(enable_flash=False, enable_mem_efficient=True, enable_math=True)
+            except Exception:
+                pass
+
         self.processor = AutoProcessor.from_pretrained(self.model_id, cache_dir=CACHE_DIR)
+        self.model = AutoModelForVision2Seq.from_pretrained(
+            self.model_id,
+            torch_dtype=self.dtype,
+            _attn_implementation=self.attn_impl,
+            cache_dir=CACHE_DIR,
+        ).to(self.device)
 
-        attn_impl = "flash_attention_2" if self.device == "cuda" else "eager"
         try:
-            self.model = AutoModelForVision2Seq.from_pretrained(
-                self.model_id,
-                torch_dtype=self.dtype,
-                _attn_implementation=attn_impl,
-                cache_dir=CACHE_DIR,
-            ).to(self.device)
+            self.model.config._attn_implementation = self.attn_impl
         except Exception:
-            # Fallback if flash-attn isn't available in the environment
-            self.model = AutoModelForVision2Seq.from_pretrained(
-                self.model_id,
-                torch_dtype=self.dtype,
-                _attn_implementation="eager",
-                cache_dir=CACHE_DIR,
-            ).to(self.device)
+            pass
 
         self.model.eval()
         self._lock = threading.Lock()
 
+    # ---------- Image utils ----------
     @staticmethod
     def _ensure_rgb(img: Image.Image) -> Image.Image:
         return img.convert("RGB") if img.mode != "RGB" else img
@@ -64,30 +92,118 @@ class SmolVLMRunner:
     def load_pil_from_bytes(cls, blobs: Sequence[bytes]) -> List[Image.Image]:
         return [cls._ensure_rgb(Image.open(BytesIO(b))) for b in blobs]
 
-    def generate(self, prompt: str, images: Sequence[Image.Image], max_new_tokens: int = 300,
-                 temperature: float | None = None, top_p: float | None = None) -> str:
+    # ---------- Inference ----------
+    def generate(
+        self,
+        prompt: str,
+        images: Sequence[Image.Image],
+        max_new_tokens: int = 300,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        return_stats: bool = False,
+    ) -> str | Tuple[str, Dict[str, Any]]:
+        """
+        Returns str by default.
+        If return_stats=True, returns (text, metrics_dict).
+        """
+        meta = {
+            "model_id": self.model_id,
+            "device": self.device,
+            "dtype": str(self.dtype).replace("torch.", ""),
+            "attn_backend": self.attn_impl,
+            "image_count": len(images),
+            "max_new_tokens": int(max_new_tokens),
+            "temperature": None if temperature is None else float(temperature),
+            "top_p": None if top_p is None else float(top_p),
+        }
+
+        t0 = perf_counter()
         content = [{"type": "image"} for _ in images] + [{"type": "text", "text": prompt}]
         messages = [{"role": "user", "content": content}]
         chat_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
 
+        # Preprocess (tokenize + vision)
         inputs = self.processor(text=chat_prompt, images=list(images), return_tensors="pt")
         inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+        t_pre_end = perf_counter()
 
+        # Inference (generate)
         gen_kwargs = dict(max_new_tokens=max_new_tokens)
         if temperature is not None:
             gen_kwargs["temperature"] = float(temperature)
         if top_p is not None:
             gen_kwargs["top_p"] = float(top_p)
 
-        with self._lock, torch.inference_mode():
-            generated_ids = self.model.generate(**inputs, **gen_kwargs)
+        if self.device == "cuda":
+            torch.cuda.synchronize()
+            torch.cuda.reset_peak_memory_stats()
 
-        text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        with self._lock, torch.inference_mode():
+            t_inf_start = perf_counter()
+            out_ids = self.model.generate(**inputs, **gen_kwargs)
+            if self.device == "cuda":
+                torch.cuda.synchronize()
+            t_inf_end = perf_counter()
+
+        # Decode
+        text = self.processor.batch_decode(out_ids, skip_special_tokens=True)[0].strip()
         if text.startswith("Assistant:"):
             text = text[len("Assistant:"):].strip()
-        return text
-
-
+        t_dec_end = perf_counter()
+
+        # Stats
+        input_tokens = int(inputs["input_ids"].shape[-1]) if "input_ids" in inputs else None
+        total_tokens = int(out_ids.shape[-1])  # includes prompt + generated
+        output_tokens = int(total_tokens - (input_tokens or 0)) if input_tokens is not None else None
+
+        pre_ms = (t_pre_end - t0) * 1000.0
+        infer_ms = (t_inf_end - t_inf_start) * 1000.0
+        decode_ms = (t_dec_end - t_inf_end) * 1000.0
+        total_ms = (t_dec_end - t0) * 1000.0
+
+        tps_infer = (output_tokens / ((t_inf_end - t_inf_start) + 1e-9)) if output_tokens else None
+        tps_total = (
+            (output_tokens / ((t_dec_end - t0) + 1e-9)) if output_tokens else None
+        )
+
+        gpu_mem_alloc_mb = gpu_mem_resv_mb = None
+        gpu_name = None
+        if self.device == "cuda":
+            try:
+                gpu_mem_alloc_mb = round(torch.cuda.max_memory_allocated() / (1024**2), 2)
+                gpu_mem_resv_mb = round(torch.cuda.max_memory_reserved() / (1024**2), 2)
+                gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())
+            except Exception:
+                pass
+
+        metrics: Dict[str, Any] = {
+            **meta,
+            "gpu_name": gpu_name,
+            "timings_ms": {
+                "preprocess": round(pre_ms, 2),
+                "inference": round(infer_ms, 2),
+                "decode": round(decode_ms, 2),
+                "total": round(total_ms, 2),
+            },
+            "tokens": {
+                "input": input_tokens,
+                "output": output_tokens,
+                "total": total_tokens,
+            },
+            "throughput": {
+                "tokens_per_sec_inference": None if tps_infer is None else round(tps_infer, 2),
+                "tokens_per_sec_end_to_end": None if tps_total is None else round(tps_total, 2),
            },
+            "gpu_memory_mb": {
+                "max_allocated": gpu_mem_alloc_mb,
+                "max_reserved": gpu_mem_resv_mb,
+            },
+        }
+
+        return (text, metrics) if return_stats else text
+
+
+# Convenience singleton
 _runner_singleton = None
 def get_runner():
     global _runner_singleton
@@ -95,4 +211,3 @@ def get_runner():
         _runner_singleton = SmolVLMRunner()
     return _runner_singleton
 
-
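
And a minimal sketch of exercising the runner directly with the new return_stats flag (the local image path and prompt are placeholders); useful for checking the metrics without going through the HTTP layer:

# direct_usage_example.py -- illustrative sketch; image path and prompt are assumptions
from PIL import Image
from util import get_runner

runner = get_runner()  # singleton; loads the model on first call
images = [Image.open("sample.jpg").convert("RGB")]  # placeholder local image

# return_stats=True makes generate() return (text, metrics) instead of a plain str
text, metrics = runner.generate(
    prompt="What is in this picture?",
    images=images,
    max_new_tokens=64,
    return_stats=True,
)
print(text)
print(metrics["timings_ms"]["inference"], "ms inference")
print(metrics["throughput"]["tokens_per_sec_inference"], "tokens/s")

Since return_stats defaults to False, existing callers of generate() continue to receive a plain string.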