Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import subprocess
|
|
| 8 |
### monkey patch
|
| 9 |
|
| 10 |
import llama_cpp._internals as internals
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
# 2️⃣ Monkey patch BEFORE creating Llama()
|
|
@@ -235,6 +236,126 @@ llm_model_qwen= None
|
|
| 235 |
|
| 236 |
|
| 237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
|
| 240 |
|
|
@@ -267,11 +388,15 @@ def respond(
|
|
| 267 |
flash_attn=True,
|
| 268 |
n_gpu_layers=-1,
|
| 269 |
n_batch=2048, # increase
|
| 270 |
-
n_ctx=
|
| 271 |
n_threads=16, # set to your CPU cores
|
| 272 |
use_mlock=True,
|
| 273 |
verbose=True,
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
)
|
| 276 |
|
| 277 |
x=llm_model_qwen.create_chat_completion(
|
|
@@ -293,7 +418,7 @@ def respond(
|
|
| 293 |
flash_attn=True,
|
| 294 |
n_gpu_layers=-1,
|
| 295 |
n_batch=2048, # increase
|
| 296 |
-
n_ctx=
|
| 297 |
n_threads=16, # set to your CPU cores
|
| 298 |
use_mlock=True,
|
| 299 |
verbose=True,
|
|
|
|
| 8 |
### monkey patch
|
| 9 |
|
| 10 |
import llama_cpp._internals as internals
|
| 11 |
+
from llama_cpp.llama_chat_format import Qwen3VLChatHandler
|
| 12 |
|
| 13 |
|
| 14 |
# 2️⃣ Monkey patch BEFORE creating Llama()
|
|
|
|
| 236 |
|
| 237 |
|
| 238 |
|
| 239 |
+
# Mapping of lowercase file extensions to their MIME types.
# Covers everything Pillow commonly reads; extend here if a new format is needed.
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',

    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',

    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',

    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',

    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',

    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}


def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream",
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.

    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()

    # Bug fix: warn only when the extension is genuinely unknown. The original
    # compared the looked-up MIME value against fallback_mime, which produced a
    # false warning whenever a caller's fallback coincided with a known mapping
    # (e.g. fallback_mime="image/png" for a ".png" file).
    if extension not in _IMAGE_MIME_TYPES:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        # Re-raise with the offending path for easier debugging; preserve cause.
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
###################### sample code ################################################
|
| 323 |
+
# --- Main Logic for Image Processing ---
|
| 324 |
+
|
| 325 |
+
# # 1. Create a list containing all image paths
|
| 326 |
+
# image_paths = [
|
| 327 |
+
# r'./scene.jpeg',
|
| 328 |
+
# r'./cat.png',
|
| 329 |
+
# r'./network.webp',
|
| 330 |
+
# # Add more image paths here if needed
|
| 331 |
+
# ]
|
| 332 |
+
|
| 333 |
+
# # 2. Create an empty list to store the message objects (images and text)
|
| 334 |
+
# images_messages = []
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
# # 3. Loop through the image path list, convert each image to a Data URI,
|
| 339 |
+
# # and add it to the message list as an image_url object.
|
| 340 |
+
# for path in image_paths:
|
| 341 |
+
# data_uri = image_to_base64_data_uri(path)
|
| 342 |
+
# images_messages.append({"type": "image_url", "image_url": {"url": data_uri}})
|
| 343 |
+
|
| 344 |
+
# # 4. Add the final text prompt at the end of the list
|
| 345 |
+
# images_messages.append({"type": "text", "text": "Describe the images."})
|
| 346 |
+
|
| 347 |
+
# # 5. Use this list to build the chat_completion request
|
| 348 |
+
# res = llm.create_chat_completion(
|
| 349 |
+
# messages=[
|
| 350 |
+
# {"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."},
|
| 351 |
+
# # The user's content is the list containing both images and text
|
| 352 |
+
# {"role": "user", "content": images_messages}
|
| 353 |
+
# ]
|
| 354 |
+
# )
|
| 355 |
+
|
| 356 |
+
# # Print the assistant's response
|
| 357 |
+
# print(res["choices"][0]["message"]["content"])
|
| 358 |
+
|
| 359 |
|
| 360 |
|
| 361 |
|
|
|
|
| 388 |
flash_attn=True,
|
| 389 |
n_gpu_layers=-1,
|
| 390 |
n_batch=2048, # increase
|
| 391 |
+
n_ctx= 8196, # reduce if you don’t need 8k
|
| 392 |
n_threads=16, # set to your CPU cores
|
| 393 |
use_mlock=True,
|
| 394 |
verbose=True,
|
| 395 |
+
chat_handler=Qwen3VLChatHandler(
|
| 396 |
+
clip_model_path=MMPROJ_PATH,
|
| 397 |
+
force_reasoning=True,
|
| 398 |
+
image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
|
| 399 |
+
),
|
| 400 |
)
|
| 401 |
|
| 402 |
x=llm_model_qwen.create_chat_completion(
|
|
|
|
| 418 |
flash_attn=True,
|
| 419 |
n_gpu_layers=-1,
|
| 420 |
n_batch=2048, # increase
|
| 421 |
+
n_ctx=8196, # reduce if you don’t need 8k
|
| 422 |
n_threads=16, # set to your CPU cores
|
| 423 |
use_mlock=True,
|
| 424 |
verbose=True,
|