Rajhuggingface4253 committed · verified
Commit 6480984 · Parent(s): 35d5417

Create app.py

Files changed (1): app.py (+1163 lines)
import asyncio
import base64
import io
import json
import logging
import time
import uuid
import threading
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Dict, List, Optional, Union
from pathlib import Path

import numpy as np
import onnxruntime as ort
from fastapi import FastAPI, HTTPException, Request, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from huggingface_hub import hf_hub_download, list_repo_files
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse, ServerSentEvent
from transformers import AutoImageProcessor, PreTrainedTokenizerFast
from PIL import Image
import aiohttp

from config import settings

# Configure logging
logging.basicConfig(
    level=getattr(logging, settings.log_level.upper()),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


# ==============================================================================
# Pydantic Models for OpenAI-compatible API
# ==============================================================================

class ImageContent(BaseModel):
    type: str = "image"
    image_url: Optional[str] = None  # data:image/jpeg;base64,... or URL


class TextContent(BaseModel):
    type: str = "text"
    text: str


class VisionMessage(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: Union[str, List[Union[ImageContent, TextContent, dict]]] = Field(..., description="Message content")


class VisionCompletionRequest(BaseModel):
    model: str = Field(default="lfm-vision", description="Model identifier")
    messages: List[VisionMessage] = Field(..., description="Conversation messages")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")
    stop: Optional[Union[str, List[str]]] = Field(default=None)


class ChatMessage(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatCompletionRequest(BaseModel):
    model: str = Field(default="lfm-vision", description="Model identifier")
    messages: List[ChatMessage] = Field(..., description="Conversation messages")
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    top_k: Optional[int] = Field(default=None, ge=0)
    max_tokens: Optional[int] = Field(default=None, ge=1)
    stream: bool = Field(default=False, description="Enable streaming response")


class ChatCompletionChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Optional[str] = None


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Dict[str, int]


class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "liquid-ai"


class ModelListResponse(BaseModel):
    object: str = "list"
    data: List[ModelInfo]
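
# Illustrative request body these models accept (values here are hypothetical,
# not from the original commit):
#
# {
#   "model": "lfm-vision",
#   "messages": [
#     {"role": "user", "content": [
#       {"type": "image", "image_url": "data:image/jpeg;base64,..."},
#       {"type": "text", "text": "What is in this image?"}
#     ]}
#   ],
#   "max_tokens": 256,
#   "stream": true
# }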

# ==============================================================================
# ONNX Vision Model Manager
# ==============================================================================

# ONNX dtype mapping
ONNX_DTYPE = {
    "tensor(float)": np.float32,
    "tensor(float16)": np.float16,
    "tensor(int64)": np.int64
}


class Lfm2VlProcessorWrapper:
    """
    Custom processor wrapper that combines ImageProcessor + Tokenizer.
    This bypasses the AutoProcessor tokenizer auto-detection bug in LFM models.
    """

    def __init__(self, image_processor, tokenizer):
        self.image_processor = image_processor
        self.tokenizer = tokenizer

    def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs):
        """
        Apply the chat template for the vision-language model.
        Converts the vision message format [{"type": "image"}, {"type": "text", "text": "..."}]
        to text with <image> placeholders, as expected by the tokenizer.
        """
        # Transform vision messages to text format
        text_messages = []
        for msg in messages:
            role = msg.get("role", "user") if isinstance(msg, dict) else getattr(msg, "role", "user")
            content = msg.get("content", "") if isinstance(msg, dict) else getattr(msg, "content", "")

            if isinstance(content, list):
                # Vision message format: [{"type": "image"}, {"type": "text", "text": "..."}]
                text_parts = []
                for item in content:
                    if isinstance(item, dict):
                        item_type = item.get("type", "")
                        if item_type == "image":
                            text_parts.append("<image>")
                        elif item_type == "text":
                            text_parts.append(item.get("text", ""))
                    else:
                        text_parts.append(str(item))
                content = "".join(text_parts)

            text_messages.append({"role": role, "content": content})

        return self.tokenizer.apply_chat_template(
            text_messages,
            add_generation_prompt=add_generation_prompt,
            tokenize=tokenize,
            **kwargs
        )
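
    # Sketch of the mapping above (hypothetical message): the vision-format input
    #   [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe"}]}]
    # is flattened to {"role": "user", "content": "<image>Describe"} before the
    # tokenizer's own chat template runs.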

    def __call__(self, images=None, text=None, **kwargs):
        """
        Process images and text for the vision-language model.

        CRITICAL: The vision encoder produces N image embeddings (e.g., 256 for a 512x512 image).
        Each embedding needs its own <image> token position in input_ids.

        This method:
        1. Processes images FIRST to determine N (number of image tokens)
        2. Expands single <image> in text to N consecutive <image> tokens
        3. Tokenizes the expanded text

        Returns a dict with pixel_values, input_ids, attention_mask, etc.
        """
        result = {}
        return_tensors = kwargs.pop('return_tensors', None)
        num_image_tokens = 0

        # Step 1: Process images FIRST to get the number of image tokens
        if images is not None:
            image_outputs = self.image_processor(images=images, return_tensors=return_tensors)
            result.update(image_outputs)

            # Calculate number of image tokens from pixel_values shape
            # pixel_values shape: [batch, num_patches, hidden_dim]
            # The MLP projector in LFM2.5-VL reduces patches by a factor of 4
            # Reference: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B
            if 'pixel_values' in image_outputs:
                pv = image_outputs['pixel_values']
                num_patches = pv.shape[1] if hasattr(pv, 'shape') else pv.size(1)
                # MLP projector reduces by a factor of 4: 1024 patches → 256 tokens
                num_image_tokens = num_patches // 4
                logger.debug(f"Image processing: {num_patches} patches → {num_image_tokens} image tokens")

        # Step 2: Expand <image> placeholder(s) to match the token count
        if text is not None:
            # Ensure text is a string
            if isinstance(text, list):
                text = text[0] if len(text) == 1 else " ".join(text)

            # Expand each <image> placeholder to N <image> tokens
            if num_image_tokens > 0 and "<image>" in text:
                # Count existing <image> placeholders; each one represents one
                # image and is expanded to its share of num_image_tokens
                image_count = text.count("<image>")
                tokens_per_image = num_image_tokens // image_count if image_count > 0 else num_image_tokens
                expanded_image = "<image>" * tokens_per_image
                text = text.replace("<image>", expanded_image)
                logger.debug(f"Expanded {image_count} <image> placeholder(s) to {tokens_per_image} tokens each")

            text_outputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=kwargs.get('padding', False),
                truncation=kwargs.get('truncation', False),
                max_length=kwargs.get('max_length', None)
            )
            result.update(text_outputs)

        return result
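
# Worked example of the expansion above (numbers from the docstring): a single
# 512x512 image yields pixel_values with 1024 patches; 1024 // 4 = 256, so the
# one "<image>" placeholder in the prompt is rewritten as 256 consecutive
# "<image>" tokens, one per projected embedding.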

class ONNXVisionModelManager:
    """Manages the ONNX vision-language model with 3 sessions: embed_tokens, embed_images, decoder."""

    def __init__(self):
        self._embed_tokens = None
        self._embed_images = None
        self._decoder = None
        self._processor = None
        self._cache_template = None
        self._lock = threading.Lock()

    @property
    def is_loaded(self) -> bool:
        return all([self._embed_tokens, self._embed_images, self._decoder])

    def download_models(self) -> Dict[str, str]:
        """Download the ONNX model files from the Hugging Face Hub."""
        model_id = settings.model_id
        encoder_var = settings.encoder_variant
        decoder_var = settings.decoder_variant

        logger.info(f"Downloading model: {model_id}")
        logger.info(f"  Encoder variant: {encoder_var}")
        logger.info(f"  Decoder variant: {decoder_var}")

        paths = {}

        # Download embed_tokens (the fp16 variant covers all quantized encoders)
        embed_suffix = "_fp16" if encoder_var in ["fp16", "q8", "q4"] else ""
        paths["embed_tokens"] = hf_hub_download(model_id, f"onnx/embed_tokens{embed_suffix}.onnx")

        # Download embed_images (vision encoder)
        img_suffix = f"_{encoder_var}" if encoder_var != "fp32" else ""
        paths["embed_images"] = hf_hub_download(model_id, f"onnx/embed_images{img_suffix}.onnx")

        # Download decoder
        dec_suffix = f"_{decoder_var}" if decoder_var != "fp32" else ""
        paths["decoder"] = hf_hub_download(model_id, f"onnx/decoder{dec_suffix}.onnx")

        # Download all data files - use exact prefix matching to avoid downloading
        # the wrong variants: only files for the selected variants are expected
        # (e.g., decoder_q8.onnx_data, not decoder.onnx_data)
        expected_prefixes = [
            f"onnx/embed_tokens{embed_suffix}.onnx_data",
            f"onnx/embed_images{img_suffix}.onnx_data",
            f"onnx/decoder{dec_suffix}.onnx_data"
        ]

        for f in list_repo_files(model_id):
            if f.startswith("onnx/") and ".onnx_data" in f:
                # Check if this file STARTS WITH one of our expected prefixes.
                # This handles split files like decoder_q8.onnx_data, decoder_q8.onnx_data_1, etc.
                if any(f.startswith(prefix) for prefix in expected_prefixes):
                    logger.info(f"Downloading: {f}")
                    hf_hub_download(model_id, f)

        return paths
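
    # Example of the names this resolves to (variant values come from
    # config.settings; "q8"/"q4" here are illustrative): with
    # encoder_variant="q8" and decoder_variant="q4" the downloads are
    #   onnx/embed_tokens_fp16.onnx
    #   onnx/embed_images_q8.onnx
    #   onnx/decoder_q4.onnx
    # plus any matching .onnx_data shards.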

    def load_model(self) -> None:
        """Load the ONNX models and processor."""
        with self._lock:
            if self.is_loaded:
                return

            logger.info("=" * 60)
            logger.info("Loading LFM2.5-VL-1.6B Vision-Language ONNX model...")
            logger.info(f"Model: {settings.model_id}")
            logger.info(f"Encoder: {settings.encoder_variant} (Q8 = ~95% accuracy)")
            logger.info(f"Decoder: {settings.decoder_variant}")
            logger.info("=" * 60)

            start_time = time.time()

            # Download models
            paths = self.download_models()

            # Configure ONNX Runtime for CPU
            sess_options = ort.SessionOptions()
            sess_options.intra_op_num_threads = settings.num_threads
            sess_options.inter_op_num_threads = settings.num_threads
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load ONNX sessions
            self._embed_tokens = ort.InferenceSession(
                paths["embed_tokens"],
                sess_options=sess_options,
                providers=['CPUExecutionProvider']
            )

            self._embed_images = ort.InferenceSession(
                paths["embed_images"],
                sess_options=sess_options,
                providers=['CPUExecutionProvider']
            )

            self._decoder = ort.InferenceSession(
                paths["decoder"],
                sess_options=sess_options,
                providers=['CPUExecutionProvider']
            )

            # Load the processor components separately to bypass the TokenizersBackend
            # bug: LFM models incorrectly specify TokenizersBackend as tokenizer_class
            logger.info("Loading image processor...")
            image_processor = AutoImageProcessor.from_pretrained(
                settings.model_id,
                trust_remote_code=True
            )

            logger.info("Loading tokenizer with PreTrainedTokenizerFast...")
            tokenizer = PreTrainedTokenizerFast.from_pretrained(
                settings.model_id,
                trust_remote_code=True
            )

            # Create our custom processor wrapper
            self._processor = Lfm2VlProcessorWrapper(
                image_processor=image_processor,
                tokenizer=tokenizer
            )
            logger.info(f"✓ Processor created: {type(self._processor).__name__}")

            # Initialize the cache template for the decoder
            self._init_cache_template()

            load_time = time.time() - start_time
            logger.info("=" * 60)
            logger.info(f"✓ Model loaded in {load_time:.2f}s")
            logger.info(f"  Threads: {settings.num_threads}")
            logger.info("  Provider: CPU")
            logger.info("=" * 60)

    def _init_cache_template(self) -> None:
        """Initialize the KV cache template for the decoder."""
        self._cache_template = {}
        for inp in self._decoder.get_inputs():
            if inp.name in {"inputs_embeds", "attention_mask", "position_ids"}:
                continue

            shape = [d if isinstance(d, int) else 1 for d in inp.shape]
            for i, d in enumerate(inp.shape):
                if isinstance(d, str) and "sequence" in d.lower():
                    shape[i] = 0

            dtype = ONNX_DTYPE.get(inp.type, np.float32)
            self._cache_template[inp.name] = (shape, dtype)

    def _create_empty_cache(self) -> Dict[str, np.ndarray]:
        """Create a new empty KV cache."""
        return {
            name: np.zeros(shape, dtype=dtype)
            for name, (shape, dtype) in self._cache_template.items()
        }
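
    # Sketch of what the template holds (the input name here is illustrative;
    # the real names come from the decoder graph): an input like
    # "past_key_values.0.key" with a symbolic "past_sequence_length" axis
    # becomes (shape_with_that_axis_set_to_0, np.float32), so the first decode
    # step runs against an empty cache and the decoder's "present.*" outputs
    # replace it on later steps.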

    @property
    def processor(self):
        if self._processor is None:
            raise RuntimeError("Processor not loaded")
        return self._processor

    def process_image(self, image: Image.Image) -> Image.Image:
        """Ensure the image is in RGB mode before processing."""
        if image.mode != "RGB":
            image = image.convert("RGB")

        return image

    def generate(
        self,
        images: List[Image.Image],
        messages: List[dict],
        max_tokens: int = 512,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ) -> List[int]:
        """Generate tokens using the ONNX vision model."""
        tokenizer = self._processor.tokenizer

        if stop_tokens is None:
            stop_tokens = [tokenizer.eos_token_id]

        # Process inputs through the processor
        prompt = self._processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self._processor(
            images=images if images else None,
            text=prompt,
            return_tensors="pt"
        )

        # Convert to numpy with correct dtypes
        input_ids = inputs["input_ids"].numpy().astype(np.int64)

        # Get token embeddings
        token_outputs = self._embed_tokens.run(None, {"input_ids": input_ids})
        token_embeds = token_outputs[0]

        # Process images if present
        if images and "pixel_values" in inputs:
            pixel_values = inputs["pixel_values"].numpy().astype(np.float32)
            pixel_attention_mask = inputs.get("pixel_attention_mask", None)
            spatial_shapes = inputs.get("spatial_shapes", None)

            image_feed = {"pixel_values": pixel_values}
            if pixel_attention_mask is not None:
                image_feed["pixel_attention_mask"] = pixel_attention_mask.numpy().astype(np.int64)
            if spatial_shapes is not None:
                image_feed["spatial_shapes"] = spatial_shapes.numpy().astype(np.int64)

            image_outputs = self._embed_images.run(None, image_feed)
            image_embeds = image_outputs[0]

            # Replace <image> tokens with image embeddings
            image_token_id = tokenizer.convert_tokens_to_ids("<image>")
            image_positions = np.where(input_ids[0] == image_token_id)[0]
            for i, pos in enumerate(image_positions):
                if i < len(image_embeds):
                    token_embeds[0, pos] = image_embeds[i]

        # Initialize KV cache
        cache = self._create_empty_cache()
        seq_len = token_embeds.shape[1]
        generated_tokens = []

        for step in range(max_tokens):
            if step == 0:
                embeds = token_embeds.astype(np.float32)
            else:
                last_token = np.array([[generated_tokens[-1]]], dtype=np.int64)
                embeds = self._embed_tokens.run(None, {"input_ids": last_token})[0].astype(np.float32)

            attn_mask = np.ones((1, seq_len + len(generated_tokens)), dtype=np.int64)

            feed = {"inputs_embeds": embeds, "attention_mask": attn_mask, **cache}
            outputs = self._decoder.run(None, feed)

            # Get logits and apply temperature
            logits = outputs[0][0, -1]

            if temperature > 0:
                logits = logits / temperature

                # Apply top-k
                if top_k > 0:
                    indices_to_remove = np.argsort(logits)[:-top_k]
                    logits[indices_to_remove] = -np.inf

                # Apply top-p (nucleus sampling)
                if top_p < 1.0:
                    sorted_indices = np.argsort(logits)[::-1]
                    sorted_logits = logits[sorted_indices]
                    probs = np.exp(sorted_logits - np.max(sorted_logits))
                    probs = probs / probs.sum()
                    cumulative_probs = np.cumsum(probs)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
                    sorted_indices_to_remove[0] = False
                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    logits[indices_to_remove] = -np.inf

                # Sample
                probs = np.exp(logits - np.max(logits))
                probs = probs / probs.sum()
                next_token = int(np.random.choice(len(probs), p=probs))
            else:
                next_token = int(np.argmax(logits))

            generated_tokens.append(next_token)

            # Update cache
            for i, out in enumerate(self._decoder.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

            if next_token in stop_tokens:
                break

        return generated_tokens
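
    # Toy walkthrough of the sampling in generate() above (illustrative logits):
    # with logits = [2.0, 1.0, 0.5, -1.0], temperature=1.0 and top_k=3 mask the
    # smallest entry to -inf; softmax over the rest gives ≈ [0.63, 0.23, 0.14].
    # With top_p=0.8 the cumulative sum [0.63, 0.86, 1.0] first exceeds 0.8 at
    # the second token, and the shift-by-one keeps exactly the tokens needed to
    # reach it, so sampling draws from ≈ [0.73, 0.27] after renormalization.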

    def generate_stream(
        self,
        images: List[Image.Image],
        messages: List[dict],
        max_tokens: int = 2000,
        temperature: float = 0.1,
        top_k: int = 50,
        top_p: float = 0.1,
        stop_tokens: Optional[List[int]] = None
    ):
        """Streaming generation for the vision model."""
        tokenizer = self._processor.tokenizer

        if stop_tokens is None:
            stop_tokens = [tokenizer.eos_token_id]

        # Process inputs through the processor
        prompt = self._processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self._processor(
            images=images if images else None,
            text=prompt,
            return_tensors="pt"
        )

        # Convert to numpy with correct dtypes
        input_ids = inputs["input_ids"].numpy().astype(np.int64)

        # Get token embeddings
        token_outputs = self._embed_tokens.run(None, {"input_ids": input_ids})
        token_embeds = token_outputs[0]

        # Process images if present
        if images and "pixel_values" in inputs:
            pixel_values = inputs["pixel_values"].numpy().astype(np.float32)
            pixel_attention_mask = inputs.get("pixel_attention_mask", None)
            spatial_shapes = inputs.get("spatial_shapes", None)

            image_feed = {"pixel_values": pixel_values}
            if pixel_attention_mask is not None:
                image_feed["pixel_attention_mask"] = pixel_attention_mask.numpy().astype(np.int64)
            if spatial_shapes is not None:
                image_feed["spatial_shapes"] = spatial_shapes.numpy().astype(np.int64)

            image_outputs = self._embed_images.run(None, image_feed)
            image_embeds = image_outputs[0]

            # Replace <image> tokens with image embeddings
            image_token_id = tokenizer.convert_tokens_to_ids("<image>")
            image_positions = np.where(input_ids[0] == image_token_id)[0]
            for i, pos in enumerate(image_positions):
                if i < len(image_embeds):
                    token_embeds[0, pos] = image_embeds[i]

        # Initialize KV cache
        cache = self._create_empty_cache()
        seq_len = token_embeds.shape[1]
        generated_tokens = []

        # Pre-allocate attention mask
        max_possible_len = seq_len + max_tokens
        attn_mask = np.ones((1, max_possible_len), dtype=np.int64)

        # Pre-compute flags
        use_temp = temperature > 0
        use_top_k = top_k > 0
        use_top_p = top_p < 1.0

        feed = {}

        for step in range(max_tokens):
            current_len = seq_len + step

            if step == 0:
                embeds = token_embeds.astype(np.float32)
            else:
                last_token = np.array([[generated_tokens[-1]]], dtype=np.int64)
                embeds = self._embed_tokens.run(None, {"input_ids": last_token})[0].astype(np.float32)

            # Update feed dict
            feed.clear()
            feed["inputs_embeds"] = embeds
            feed["attention_mask"] = attn_mask[:, :current_len]
            feed.update(cache)

            # Inference
            outputs = self._decoder.run(None, feed)
            logits = outputs[0][0, -1]

            # Sampling
            if use_temp:
                logits /= temperature

                if use_top_k and top_k < len(logits):
                    top_k_idx = np.argpartition(logits, -top_k)[-top_k:]
                    mask = np.ones(logits.shape, dtype=bool)
                    mask[top_k_idx] = False
                    logits[mask] = -np.inf

                if use_top_p:
                    valid_mask = logits > -np.inf
                    if valid_mask.any():
                        valid_logits = logits[valid_mask]
                        valid_indices = np.where(valid_mask)[0]

                        sorted_indices = np.argsort(valid_logits)[::-1]
                        sorted_logits = valid_logits[sorted_indices]

                        exp_logits = np.exp(sorted_logits - np.max(sorted_logits))
                        probs = exp_logits / exp_logits.sum()

                        cumulative = np.cumsum(probs)
                        cutoff = np.searchsorted(cumulative, top_p)
                        cutoff = min(cutoff + 1, len(sorted_logits))

                        accepted_indices = sorted_indices[:cutoff]
                        accepted_probs = probs[:cutoff]
                        accepted_probs /= accepted_probs.sum()

                        sample_idx = np.searchsorted(np.cumsum(accepted_probs), np.random.rand())
                        next_token = int(valid_indices[accepted_indices[sample_idx]])
                    else:
                        next_token = int(np.argmax(logits))
                else:
                    valid_mask = logits > -np.inf
                    valid_logits = logits[valid_mask]
                    valid_indices = np.where(valid_mask)[0]
                    exp_logits = np.exp(valid_logits - np.max(valid_logits))
                    probs = exp_logits / exp_logits.sum()
                    sample_idx = np.searchsorted(np.cumsum(probs), np.random.rand())
                    next_token = int(valid_indices[sample_idx])
            else:
                next_token = int(np.argmax(logits))

            generated_tokens.append(next_token)
            yield next_token

            if next_token in stop_tokens:
                break

            # Update cache
            for i, out in enumerate(self._decoder.get_outputs()[1:], 1):
                name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
                if name in cache:
                    cache[name] = outputs[i]

    def unload(self) -> None:
        """Unload the models from memory."""
        with self._lock:
            if self._embed_tokens is not None:
                del self._embed_tokens
                del self._embed_images
                del self._decoder
                del self._processor
                self._embed_tokens = None
                self._embed_images = None
                self._decoder = None
                self._processor = None
                logger.info("Models unloaded")


# Global model manager
model_manager = ONNXVisionModelManager()

# ==============================================================================
# Image Processing Utilities
# ==============================================================================

def resize_image_for_model(image: Image.Image, max_dim: int = 512) -> Image.Image:
    """
    Resize an image to a maximum dimension while preserving aspect ratio,
    using LANCZOS (highest quality) resampling for best visual fidelity.

    This optimization ensures:
    - Consistent processing time (~3-4s) regardless of input size
    - Single-patch processing (256 tokens) instead of tiling
    - Reduced memory usage

    Args:
        image: PIL Image to resize
        max_dim: Maximum dimension (width or height), default 512

    Returns:
        Resized PIL Image (or the original if already small enough)
    """
    width, height = image.size

    # Skip if already small enough
    if width <= max_dim and height <= max_dim:
        logger.debug(f"Image {width}x{height} already within {max_dim}px limit")
        return image

    # Calculate new dimensions (preserving aspect ratio)
    ratio = min(max_dim / width, max_dim / height)
    new_width = int(width * ratio)
    new_height = int(height * ratio)

    logger.info(f"Resizing image: {width}x{height} → {new_width}x{new_height} (LANCZOS)")

    # Resize with the high-quality LANCZOS filter
    return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
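
# Worked example of the resize math above (input size is illustrative): a
# 4032x3024 photo gives ratio = min(512/4032, 512/3024) ≈ 0.127, so it is
# resized to 512x384 before encoding.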

async def load_image_from_url(url: str) -> Image.Image:
    """Load an image from a URL, convert to RGB, and resize for optimal processing."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise HTTPException(status_code=400, detail=f"Failed to fetch image from URL: {url}")
            data = await response.read()
            image = Image.open(io.BytesIO(data))
            # Convert to RGB to ensure a consistent channel format
            if image.mode != 'RGB':
                image = image.convert('RGB')
            # Resize for optimal model processing (max 512x512)
            image = resize_image_for_model(image)
            return image


def load_image_from_base64(data_url: str) -> Image.Image:
    """Load an image from a base64 data URL, convert to RGB, and resize for optimal processing."""
    # Format: data:image/jpeg;base64,/9j/4AAQ...
    if "," in data_url:
        header, encoded = data_url.split(",", 1)
    else:
        encoded = data_url

    image_data = base64.b64decode(encoded)
    image = Image.open(io.BytesIO(image_data))
    # Convert to RGB to ensure a consistent channel format
    if image.mode != 'RGB':
        image = image.convert('RGB')
    # Resize for optimal model processing (max 512x512)
    image = resize_image_for_model(image)
    return image


async def process_image_content(content: Union[ImageContent, dict]) -> Optional[Image.Image]:
    """Process image content from a request."""
    if isinstance(content, dict):
        content = ImageContent(**content)

    if content.type != "image":
        return None

    if not content.image_url:
        return None

    url = content.image_url

    # Check if it's a base64 data URL
    if url.startswith("data:"):
        return load_image_from_base64(url)
    else:
        # It's a regular URL
        return await load_image_from_url(url)
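
# Illustrative only (the file path is hypothetical): building a data URL that
# load_image_from_base64 accepts:
#   with open("photo.jpg", "rb") as fh:
#       data_url = "data:image/jpeg;base64," + base64.b64encode(fh.read()).decode()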

# ==============================================================================
# Application Lifecycle
# ==============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler."""
    logger.info("Starting LFM2.5-VL Vision API Server (ONNX Runtime)...")

    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, model_manager.load_model)

    yield

    logger.info("Shutting down...")
    model_manager.unload()


# ==============================================================================
# FastAPI Application
# ==============================================================================

app = FastAPI(
    title=settings.app_name,
    description="Fast CPU inference for the LiquidAI LFM2.5-VL-1.6B vision-language model using ONNX Runtime",
    version=settings.app_version,
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

origins = [
    "http://127.0.0.1:5500",
    "http://127.0.0.1:5501",
    "http://localhost:5500",
    "http://localhost:5173",
    "https://toolboxesai.com"
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.middleware("http")
async def add_cors_for_null_origin(request: Request, call_next):
    """Handle CORS for a null origin (when the HTML is opened from file://)."""
    origin = request.headers.get("origin", "")
    response = await call_next(request)

    if origin == "null" or not origin:
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Methods"] = "GET, POST, PUT, DELETE, OPTIONS"
        response.headers["Access-Control-Allow-Headers"] = "*"
        response.headers["Access-Control-Expose-Headers"] = "*"

    return response

# ==============================================================================
# Helper Functions
# ==============================================================================

def generate_id() -> str:
    return f"chatcmpl-{uuid.uuid4().hex[:12]}"


async def extract_images_and_text(messages: List[VisionMessage]) -> tuple[List[Image.Image], List[dict]]:
    """Extract images and convert messages to the processor format."""
    images = []
    processed_messages = []

    for msg in messages:
        if isinstance(msg.content, str):
            # Simple text message
            processed_messages.append({
                "role": msg.role,
                "content": msg.content
            })
        else:
            # Mixed content (images + text)
            content_parts = []
            for item in msg.content:
                if isinstance(item, dict):
                    item_type = item.get("type", "")
                else:
                    item_type = item.type

                if item_type == "image":
                    image = await process_image_content(item)
                    if image:
                        images.append(image)
                        content_parts.append({"type": "image"})
                elif item_type == "text":
                    text = item.get("text", "") if isinstance(item, dict) else item.text
                    content_parts.append({"type": "text", "text": text})

            processed_messages.append({
                "role": msg.role,
                "content": content_parts
            })

    return images, processed_messages

async def stream_vision_completion(request: VisionCompletionRequest) -> AsyncGenerator[dict, None]:
    """Streaming vision completion."""
    request_id = generate_id()
    created = int(time.time())

    loop = asyncio.get_running_loop()
    async_queue = asyncio.Queue()

    # Extract images and process messages
    images, processed_messages = await extract_images_and_text(request.messages)

    tokenizer = model_manager.processor.tokenizer

    # Config
    max_tokens = request.max_tokens or settings.max_tokens
    temperature = request.temperature if request.temperature is not None else settings.temperature
    top_k = request.top_k if request.top_k is not None else settings.top_k
    top_p = request.top_p if request.top_p is not None else settings.top_p

    # Prepare stop tokens (only the first token of each stop string is used)
    stop_tokens = [tokenizer.eos_token_id]
    if request.stop:
        if isinstance(request.stop, str):
            encoded = tokenizer.encode(request.stop, add_special_tokens=False)
            if encoded:
                stop_tokens.append(encoded[0])
        elif isinstance(request.stop, list):
            for stop_str in request.stop:
                encoded = tokenizer.encode(stop_str, add_special_tokens=False)
                if encoded:
                    stop_tokens.append(encoded[0])

    def generate_tokens():
        try:
            for token in model_manager.generate_stream(
                images,
                processed_messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                stop_tokens=stop_tokens
            ):
                loop.call_soon_threadsafe(async_queue.put_nowait, ("token", token))
        except Exception as e:
            logger.error(f"Stream generation error: {e}")
            loop.call_soon_threadsafe(async_queue.put_nowait, ("error", str(e)))
        finally:
            loop.call_soon_threadsafe(async_queue.put_nowait, ("done", None))

    threading.Thread(target=generate_tokens, daemon=True).start()

    try:
        while True:
            msg_type, data = await async_queue.get()

            if msg_type == "token":
                text = tokenizer.decode([data], skip_special_tokens=True)
                if text:
                    chunk = {
                        "id": request_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": request.model,
                        "choices": [{
                            "index": 0,
                            "delta": {"content": text},
                            "finish_reason": None
                        }]
                    }
                    yield {"data": json.dumps(chunk)}

            elif msg_type == "done":
                final = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": request.model,
                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
                }
                yield {"data": json.dumps(final)}
                yield {"data": "[DONE]"}
                break

            elif msg_type == "error":
                logger.error(f"Stream error: {data}")
                yield {"data": json.dumps({"error": {"message": data}})}
                break

    except asyncio.CancelledError:
        logger.info(f"Stream cancelled for request {request_id[:8]}")
        raise
    except Exception as e:
        logger.error(f"Streaming error: {e}")
        yield {"data": json.dumps({"error": {"message": str(e)}})}
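
# Shape of the resulting SSE stream (values illustrative):
#   data: {"id": "chatcmpl-...", "object": "chat.completion.chunk", ..., "choices": [{"index": 0, "delta": {"content": "Hel"}, "finish_reason": null}]}
#   data: {"id": "chatcmpl-...", "object": "chat.completion.chunk", ..., "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
#   data: [DONE]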

# ==============================================================================
# API Endpoints
# ==============================================================================

@app.get("/health")
async def health():
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return {"status": "healthy"}


@app.get("/v1/models", response_model=ModelListResponse)
async def list_models():
    return ModelListResponse(
        data=[
            ModelInfo(id="lfm-vision", created=int(time.time())),
            ModelInfo(id="lfm-2.5-vl-1.6b-onnx", created=int(time.time()))
        ]
    )


@app.post("/v1/vision/completions")
async def vision_completions(request: VisionCompletionRequest):
    """Vision-language completion with image support."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    if request.stream:
        return EventSourceResponse(
            stream_vision_completion(request),
            media_type="text/event-stream",
            ping=30,  # sse-starlette's ping interval is in seconds
            ping_message_factory=lambda: ServerSentEvent(comment="ping")
        )

    try:
        # Extract images and process messages
        images, processed_messages = await extract_images_and_text(request.messages)

        tokenizer = model_manager.processor.tokenizer

        max_tokens = request.max_tokens or settings.max_tokens
        temperature = request.temperature if request.temperature is not None else settings.temperature
        top_k = request.top_k if request.top_k is not None else settings.top_k
        top_p = request.top_p if request.top_p is not None else settings.top_p

        start_time = time.time()

        loop = asyncio.get_running_loop()
        tokens = await loop.run_in_executor(
            None,
            lambda: model_manager.generate(
                images,
                processed_messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p
            )
        )

        response_text = tokenizer.decode(tokens, skip_special_tokens=True)
        gen_time = time.time() - start_time

        logger.debug(f"Generated {len(tokens)} tokens in {gen_time:.2f}s")

        return ChatCompletionResponse(
            id=generate_id(),
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=ChatMessage(role="assistant", content=response_text),
                    finish_reason="stop"
                )
            ],
            usage={
                "prompt_tokens": 0,  # input tokens are not tracked yet
                "completion_tokens": len(tokens),
                "total_tokens": len(tokens)
            }
        )

    except Exception as e:
        logger.error(f"Vision completion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """Text-only chat completion (for compatibility)."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Convert to the vision request format (no images)
    vision_messages = [
        VisionMessage(role=m.role, content=m.content)
        for m in request.messages
    ]

    vision_request = VisionCompletionRequest(
        model=request.model,
        messages=vision_messages,
        temperature=request.temperature,
        top_p=request.top_p,
        top_k=request.top_k,
        max_tokens=request.max_tokens,
        stream=request.stream
    )

    return await vision_completions(vision_request)

@app.post("/v1/vision/upload")
async def upload_image(
    file: UploadFile = File(...),
    prompt: str = "What is in this image?"
):
    """Direct image upload endpoint."""
    if not model_manager.is_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Validate the file type
    content_type = file.content_type or ""
    file_ext = Path(file.filename or "").suffix.lower().lstrip(".")

    if file_ext not in settings.supported_formats and not any(fmt in content_type for fmt in settings.supported_formats):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported image format. Supported: {settings.supported_formats}"
        )

    # Read and validate the image
    contents = await file.read()
    if len(contents) > settings.max_image_size_mb * 1024 * 1024:
        raise HTTPException(
            status_code=400,
            detail=f"Image too large. Max size: {settings.max_image_size_mb}MB"
        )

    try:
        image = Image.open(io.BytesIO(contents))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image: {e}")

    # Normalize to RGB and resize, matching the URL/base64 paths
    if image.mode != "RGB":
        image = image.convert("RGB")
    image = resize_image_for_model(image)

    # Create the request
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt}
        ]
    }]

    tokenizer = model_manager.processor.tokenizer

    # Run the blocking generation off the event loop
    loop = asyncio.get_running_loop()
    tokens = await loop.run_in_executor(
        None,
        lambda: model_manager.generate(
            [image],
            messages,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
            top_k=settings.top_k,
            top_p=settings.top_p
        )
    )

    response_text = tokenizer.decode(tokens, skip_special_tokens=True)

    return {
        "id": generate_id(),
        "model": "lfm-vision",
        "response": response_text
    }
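
# Illustrative call (host/port depend on config.settings; "photo.jpg" is a
# placeholder). Note that `prompt` is a query parameter, not a form field:
#   curl -X POST "http://localhost:8000/v1/vision/upload?prompt=Describe%20this%20image" \
#        -F "file=@photo.jpg"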

# ==============================================================================
# Run Server
# ==============================================================================

if __name__ == "__main__":
    import uvicorn

    logger.info(f"Starting server on {settings.host}:{settings.port}")

    uvicorn.run(
        "app:app",
        host=settings.host,
        port=settings.port,
        reload=False,
        log_level=settings.log_level
    )
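
# ------------------------------------------------------------------------------
# Illustrative client sketch (not part of the app; assumes the server listens
# on http://localhost:8000 and that the `requests` package is installed):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={
#           "model": "lfm-vision",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])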