Spaces:
Sleeping
Sleeping
Auto commit at 24-2025-08 18:22:22
Browse files
lily_llm_api/api/routers/generation_router.py
CHANGED
|
@@ -25,7 +25,11 @@ async def generate(request: Request,
|
|
| 25 |
user_id: str = Form("anonymous"),
|
| 26 |
room_id: str = Form("default"),
|
| 27 |
use_context: bool = Form(True),
|
| 28 |
-
session_id: str = Form(None)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
if not is_model_loaded():
|
| 31 |
raise HTTPException(status_code=503, detail="๋ชจ๋ธ์ด ๋ก๋๋์ง ์์์ต๋๋ค.")
|
|
@@ -74,7 +78,7 @@ async def generate(request: Request,
|
|
| 74 |
|
| 75 |
try:
|
| 76 |
# generate_sync 함수 호출 (컨텍스트 포함)
|
| 77 |
-
result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id)
|
| 78 |
|
| 79 |
if "error" in result:
|
| 80 |
raise HTTPException(status_code=500, detail=result["error"])
|
|
|
|
| 25 |
user_id: str = Form("anonymous"),
|
| 26 |
room_id: str = Form("default"),
|
| 27 |
use_context: bool = Form(True),
|
| 28 |
+
session_id: str = Form(None),
|
| 29 |
+
use_rag_images: bool = Form(False),
|
| 30 |
+
use_rag_text: bool = Form(False),
|
| 31 |
+
document_id: str = Form(None),
|
| 32 |
+
image_short_side: int = Form(None)):
|
| 33 |
|
| 34 |
if not is_model_loaded():
|
| 35 |
raise HTTPException(status_code=503, detail="๋ชจ๋ธ์ด ๋ก๋๋์ง ์์์ต๋๋ค.")
|
|
|
|
| 78 |
|
| 79 |
try:
|
| 80 |
# generate_sync 함수 호출 (컨텍스트 포함)
|
| 81 |
+
result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id, use_rag_images=use_rag_images, use_rag_text=use_rag_text, document_id=document_id, image_short_side=image_short_side)
|
| 82 |
|
| 83 |
if "error" in result:
|
| 84 |
raise HTTPException(status_code=500, detail=result["error"])
|
lily_llm_api/services/generation_service.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
Generation service for Lily LLM API
|
| 3 |
"""
|
| 4 |
import logging
|
|
|
|
| 5 |
import time
|
| 6 |
from typing import Optional, List, Dict
|
| 7 |
from pathlib import Path
|
|
@@ -16,6 +17,11 @@ logger = logging.getLogger(__name__)
|
|
| 16 |
# 주의: 프로세스 재시작 시 초기화됨. 최대 4장 보관.
|
| 17 |
_session_image_cache: Dict[str, List[bytes]] = {}
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# 선택적: 벡터 스토어에서 최근 문서 이미지 복구 지원
|
| 20 |
try:
|
| 21 |
from lily_llm_core.vector_store_manager import vector_store_manager, SimpleVectorStore
|
|
@@ -26,7 +32,9 @@ except Exception:
|
|
| 26 |
def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
|
| 27 |
temperature: Optional[float] = None, top_p: Optional[float] = None,
|
| 28 |
do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
|
| 29 |
-
user_id: str = "anonymous", room_id: str = "default"
|
|
|
|
|
|
|
| 30 |
"""[์ต์ ํ] ๋ชจ๋ธ ์์ฑ์ ์ฒ๋ฆฌํ๋ ํตํฉ ๋๊ธฐ ํจ์"""
|
| 31 |
try:
|
| 32 |
from .model_service import get_current_profile, get_current_model
|
|
@@ -92,8 +100,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 92 |
all_image_data.extend(cached_imgs)
|
| 93 |
print(f"๐ [DEBUG] ์ธ์
์บ์์์ ์ด์ ์ด๋ฏธ์ง {len(cached_imgs)}๊ฐ ๋ณต๊ตฌ (์ธ์
: {session_id})")
|
| 94 |
|
| 95 |
-
# ์ถ๊ฐ ๋ณต๊ตฌ: ์ฌ์ ํ ์ด๋ฏธ์ง๊ฐ ์๊ณ
|
| 96 |
-
if (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
|
| 97 |
try:
|
| 98 |
if vector_store_manager is not None:
|
| 99 |
# 사용자 문서 목록 가져오기 (최신순 정렬)
|
|
@@ -129,9 +137,12 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 129 |
print("โ ๏ธ [DEBUG] vector_store_manager ๋ฏธ์ฌ์ฉ - ์ด๋ฏธ์ง ๋ณต๊ตฌ ๋นํ์ฑํ")
|
| 130 |
except Exception as e:
|
| 131 |
print(f"โ ๏ธ [DEBUG] RAG ๊ธฐ๋ฐ ์ด๋ฏธ์ง ๋ณต๊ตฌ ์คํจ: {e}")
|
|
|
|
|
|
|
| 132 |
|
| 133 |
# 항상 참조 가능한 max_images 정의 (이미지 없으면 0)
|
| 134 |
-
|
|
|
|
| 135 |
|
| 136 |
if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
|
| 137 |
print(f"๐ [DEBUG] ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์์ - ์ด ์ด๋ฏธ์ง ๊ฐ์: {len([img for img in all_image_data if img])}")
|
|
@@ -146,6 +157,22 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 146 |
if image_bytes:
|
| 147 |
try:
|
| 148 |
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
# ๐ ๊ณต์ ์ด๋ฏธ์ง ํ๋ก์ธ์ ์ฌ์ฉ
|
| 150 |
if processor and hasattr(processor, 'image_processor'):
|
| 151 |
processed = processor.image_processor(pil_image)
|
|
@@ -169,6 +196,22 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 169 |
if image_bytes:
|
| 170 |
try:
|
| 171 |
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
# ๐ ๊ณต์ ์ด๋ฏธ์ง ํ๋ก์ธ์ ์ฌ์ฉ
|
| 173 |
if processor and hasattr(processor, 'image_processor'):
|
| 174 |
# KananaVImageProcessor๋ ๊ธฐ๋ณธ ํ๋ผ๋ฏธํฐ๋ง ์ง์
|
|
@@ -189,10 +232,71 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 189 |
logger.error(f"โ ์ด๋ฏธ์ง ์ ์ฒ๋ฆฌ ์คํจ: {e}")
|
| 190 |
combined_image_metas = {}
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# --- 2. 프롬프트 구성 ---
|
| 193 |
print(f"๐ [DEBUG] ํ๋กฌํํธ ๊ตฌ์ฑ ์์")
|
| 194 |
|
| 195 |
-
# ์ปจํ
์คํธ ํตํฉ (๋ํ ๊ธฐ๋ก + RAG ๊ฒ์
|
| 196 |
context_prompt = ""
|
| 197 |
if use_context and session_id:
|
| 198 |
try:
|
|
@@ -209,6 +313,34 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 209 |
except Exception as e:
|
| 210 |
print(f"โ ๏ธ [DEBUG] ์ปจํ
์คํธ ๋ก๋ ์คํจ: {e}")
|
| 211 |
context_prompt = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
# 이미지 바이트를 세션 캐시에 보관 (다음 턴에 재사용)
|
| 213 |
if session_id:
|
| 214 |
# 원본 요청에 이미지가 있었다면 그걸 우선 보관, 없으면 복구된 이미지 유지
|
|
@@ -226,6 +358,16 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 226 |
|
| 227 |
# formatted_prompt 초기화
|
| 228 |
formatted_prompt = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
# ๐ ๋ฉํฐ๋ชจ๋ฌ ํ๋กฌํํธ ๊ตฌ์ฑ (๊ณต์ ๋ฐฉ์)
|
| 231 |
if all_pixel_values and len(all_pixel_values) > 0:
|
|
@@ -233,6 +375,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 233 |
num_images = len(all_pixel_values)
|
| 234 |
image_tokens = "<image>" * num_images # ์ด๋ฏธ์ง ๊ฐ์๋งํผ <image> ํ ํฐ ์์ฑ
|
| 235 |
# 답변 유도를 위해 Assistant 프리픽스 추가
|
|
|
|
| 236 |
formatted_prompt = f"Human: {image_tokens}{prompt}\nAssistant:"
|
| 237 |
print(f"๐ [DEBUG] ๋ฉํฐ๋ชจ๋ฌ ํ๋กฌํํธ ๊ตฌ์ฑ (๊ณต์ ํ์): {formatted_prompt}")
|
| 238 |
print(f"๐ [DEBUG] ์ด๋ฏธ์ง ํ ํฐ ์์ฑ: {num_images}๊ฐ ์ด๋ฏธ์ง -> {image_tokens}")
|
|
@@ -400,20 +543,68 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 400 |
# ๐ ์ต์ข
๊ฒ์ฆ
|
| 401 |
print(f"๐ [DEBUG] ์ต์ข
input_ids ํ์
: {type(input_ids)}, shape: {input_ids.shape}")
|
| 402 |
print(f"๐ [DEBUG] ์ต์ข
attention_mask ํ์
: {type(attention_mask)}, shape: {attention_mask.shape}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
except Exception as e:
|
| 404 |
-
print(f"โ [DEBUG] encode_prompt ์คํจ: {e}
|
| 405 |
-
#
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
else:
|
| 418 |
# 안전 폴백
|
| 419 |
print(f"๐ [DEBUG] ๊ธฐ๋ณธ ํ ํฌ๋์ด์ ์ฌ์ฉ (ํด๋ฐฑ)")
|
|
@@ -586,25 +777,15 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 586 |
else:
|
| 587 |
processed_image_metas[key] = value
|
| 588 |
|
| 589 |
-
#
|
| 590 |
-
|
| 591 |
-
image_token_thw
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
if len(token_info) == 3:
|
| 599 |
-
t, h, w = token_info
|
| 600 |
-
total_image_tokens += t * h * w
|
| 601 |
-
elif len(token_info) == 2:
|
| 602 |
-
h, w = token_info
|
| 603 |
-
total_image_tokens += h * w
|
| 604 |
-
print(f"๐ [DEBUG] ๊ณ์ฐ๋ ์ด ์ด๋ฏธ์ง ํ ํฐ ์(์ฐธ๊ณ ): {total_image_tokens}")
|
| 605 |
-
if isinstance(total_image_tokens, torch.Tensor):
|
| 606 |
-
total_image_tokens = total_image_tokens.sum().item()
|
| 607 |
-
print(f"๐ [DEBUG] pixel_values ๊ธธ์ด: {pixel_values.shape[0]}, ์์: {total_image_tokens} (์กฐ์ ์ํจ)")
|
| 608 |
|
| 609 |
# ์์ ๊ฐ๋: vision_grid_thw๊ฐ [1, N, 3]๋ก ์ค๋ฉด [N, 3]๋ก ๋ณํ
|
| 610 |
try:
|
|
|
|
| 2 |
Generation service for Lily LLM API
|
| 3 |
"""
|
| 4 |
import logging
|
| 5 |
+
import os
|
| 6 |
import time
|
| 7 |
from typing import Optional, List, Dict
|
| 8 |
from pathlib import Path
|
|
|
|
| 17 |
# 주의: 프로세스 재시작 시 초기화됨. 최대 4장 보관.
|
| 18 |
_session_image_cache: Dict[str, List[bytes]] = {}
|
| 19 |
|
| 20 |
+
# ์๋ ๋ฆฌ์ฌ์ด์ฆ ๊ฐ์ ์ค์ (์ฝ๋์์ ์ง์ ์์ ์ฉ). ์: 128, 256, 512 ... None์ด๋ฉด ๋นํ์ฑํ
|
| 21 |
+
DEFAULT_IMAGE_SHORT_SIDE: Optional[int] = 128
|
| 22 |
+
|
| 23 |
+
max_images_limit = 4
|
| 24 |
+
|
| 25 |
# 선택적: 벡터 스토어에서 최근 문서 이미지 복구 지원
|
| 26 |
try:
|
| 27 |
from lily_llm_core.vector_store_manager import vector_store_manager, SimpleVectorStore
|
|
|
|
| 32 |
def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
|
| 33 |
temperature: Optional[float] = None, top_p: Optional[float] = None,
|
| 34 |
do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
|
| 35 |
+
user_id: str = "anonymous", room_id: str = "default", use_rag_images: bool = False,
|
| 36 |
+
use_rag_text: bool = False, document_id: Optional[str] = None,
|
| 37 |
+
image_short_side: Optional[int] = None) -> dict:
|
| 38 |
"""[์ต์ ํ] ๋ชจ๋ธ ์์ฑ์ ์ฒ๋ฆฌํ๋ ํตํฉ ๋๊ธฐ ํจ์"""
|
| 39 |
try:
|
| 40 |
from .model_service import get_current_profile, get_current_model
|
|
|
|
| 100 |
all_image_data.extend(cached_imgs)
|
| 101 |
print(f"๐ [DEBUG] ์ธ์
์บ์์์ ์ด์ ์ด๋ฏธ์ง {len(cached_imgs)}๊ฐ ๋ณต๊ตฌ (์ธ์
: {session_id})")
|
| 102 |
|
| 103 |
+
# ์ถ๊ฐ ๋ณต๊ตฌ: ์ฌ์ ํ ์ด๋ฏธ์ง๊ฐ ์๊ณ ๋ฉํฐ๋ชจ๋ฌ์ด๋ฉฐ, ๋ช
์์ ์ผ๋ก ํ์ฉ๋ ๊ฒฝ์ฐ์๋ง RAG์์ ์ด๋ฏธ์ง ๋ณต์
|
| 104 |
+
if use_rag_images and (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
|
| 105 |
try:
|
| 106 |
if vector_store_manager is not None:
|
| 107 |
# 사용자 문서 목록 가져오기 (최신순 정렬)
|
|
|
|
| 137 |
print("โ ๏ธ [DEBUG] vector_store_manager ๋ฏธ์ฌ์ฉ - ์ด๋ฏธ์ง ๋ณต๊ตฌ ๋นํ์ฑํ")
|
| 138 |
except Exception as e:
|
| 139 |
print(f"โ ๏ธ [DEBUG] RAG ๊ธฐ๋ฐ ์ด๋ฏธ์ง ๋ณต๊ตฌ ์คํจ: {e}")
|
| 140 |
+
elif not use_rag_images and getattr(current_profile, 'multimodal', False):
|
| 141 |
+
print("๐ [DEBUG] RAG ์ด๋ฏธ์ง ๋ณต๊ตฌ ๋นํ์ฑํ๋จ(use_rag_images=False) - ํ
์คํธ ์ ์ฉ ์ ์ง")
|
| 142 |
|
| 143 |
# 항상 참조 가능한 max_images 정의 (이미지 없으면 0)
|
| 144 |
+
# 1차 상한은 4장으로 제한 (최종 제한은 예산 기반 동적 제한에서 결정)
|
| 145 |
+
max_images = min(len([img for img in all_image_data if img]) if all_image_data else 0, max_images_limit)
|
| 146 |
|
| 147 |
if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
|
| 148 |
print(f"๐ [DEBUG] ์ด๋ฏธ์ง ์ฒ๋ฆฌ ์์ - ์ด ์ด๋ฏธ์ง ๊ฐ์: {len([img for img in all_image_data if img])}")
|
|
|
|
| 157 |
if image_bytes:
|
| 158 |
try:
|
| 159 |
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 160 |
+
# ์๋ ๋ฆฌ์ฌ์ด์ฆ ์ฐ์ ์์: API ํ๋ผ๋ฏธํฐ > ์ฝ๋ ์์ > ํ๊ฒฝ๋ณ์
|
| 161 |
+
env_short_side = os.getenv('LILY_IMAGE_SHORT_SIDE')
|
| 162 |
+
try:
|
| 163 |
+
env_short_side_val = int(env_short_side) if env_short_side is not None else None
|
| 164 |
+
except Exception:
|
| 165 |
+
env_short_side_val = None
|
| 166 |
+
effective_short_side = image_short_side or DEFAULT_IMAGE_SHORT_SIDE or env_short_side_val
|
| 167 |
+
if effective_short_side:
|
| 168 |
+
s = max(128, min(int(effective_short_side), 2048))
|
| 169 |
+
w, h = pil_image.size
|
| 170 |
+
if w > 0 and h > 0:
|
| 171 |
+
if w <= h:
|
| 172 |
+
new_w = s; new_h = int(h * (s / w))
|
| 173 |
+
else:
|
| 174 |
+
new_h = s; new_w = int(w * (s / h))
|
| 175 |
+
pil_image = pil_image.resize((max(1, new_w), max(1, new_h)))
|
| 176 |
# ๐ ๊ณต์ ์ด๋ฏธ์ง ํ๋ก์ธ์ ์ฌ์ฉ
|
| 177 |
if processor and hasattr(processor, 'image_processor'):
|
| 178 |
processed = processor.image_processor(pil_image)
|
|
|
|
| 196 |
if image_bytes:
|
| 197 |
try:
|
| 198 |
pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 199 |
+
# ์๋ ๋ฆฌ์ฌ์ด์ฆ ์ฐ์ ์์: API ํ๋ผ๋ฏธํฐ > ์ฝ๋ ์์ > ํ๊ฒฝ๋ณ์
|
| 200 |
+
env_short_side = os.getenv('LILY_IMAGE_SHORT_SIDE')
|
| 201 |
+
try:
|
| 202 |
+
env_short_side_val = int(env_short_side) if env_short_side is not None else None
|
| 203 |
+
except Exception:
|
| 204 |
+
env_short_side_val = None
|
| 205 |
+
effective_short_side = image_short_side or DEFAULT_IMAGE_SHORT_SIDE or env_short_side_val
|
| 206 |
+
if effective_short_side:
|
| 207 |
+
s = max(128, min(int(effective_short_side), 2048))
|
| 208 |
+
w, h = pil_image.size
|
| 209 |
+
if w > 0 and h > 0:
|
| 210 |
+
if w <= h:
|
| 211 |
+
new_w = s; new_h = int(h * (s / w))
|
| 212 |
+
else:
|
| 213 |
+
new_h = s; new_w = int(w * (s / h))
|
| 214 |
+
pil_image = pil_image.resize((max(1, new_w), max(1, new_h)))
|
| 215 |
# ๐ ๊ณต์ ์ด๋ฏธ์ง ํ๋ก์ธ์ ์ฌ์ฉ
|
| 216 |
if processor and hasattr(processor, 'image_processor'):
|
| 217 |
# KananaVImageProcessor๋ ๊ธฐ๋ณธ ํ๋ผ๋ฏธํฐ๋ง ์ง์
|
|
|
|
| 232 |
logger.error(f"โ ์ด๋ฏธ์ง ์ ์ฒ๋ฆฌ ์คํจ: {e}")
|
| 233 |
combined_image_metas = {}
|
| 234 |
|
| 235 |
+
# ๐ง ์ด๋ฏธ์ง ํ ํฐ ์์ฐ ๊ธฐ๋ฐ ๋์ ์ ํ (๋ฉํฐ๋ชจ๋ฌ ๊ธธ์ด ์ด๊ณผ ๋ฐฉ์ง)
|
| 236 |
+
try:
|
| 237 |
+
# 1) 이미지별 토큰 수 산출
|
| 238 |
+
per_image_tokens: List[int] = []
|
| 239 |
+
if isinstance(combined_image_metas, dict) and 'image_token_thw' in combined_image_metas:
|
| 240 |
+
for thw in combined_image_metas['image_token_thw']:
|
| 241 |
+
if isinstance(thw, (list, tuple)) and len(thw) == 3:
|
| 242 |
+
per_image_tokens.append(int(thw[0]) * int(thw[1]) * int(thw[2]))
|
| 243 |
+
else:
|
| 244 |
+
per_image_tokens.append(0)
|
| 245 |
+
else:
|
| 246 |
+
# ๋ฉํ๊ฐ ์์ผ๋ฉด ๋ณด์์ ์ผ๋ก ํฐ ๊ฐ์ผ๋ก ๊ฐ์ฃผํ์ฌ ํ
์คํธ-only๋ก ์ ๋
|
| 247 |
+
per_image_tokens = [3000] * len(all_pixel_values)
|
| 248 |
+
|
| 249 |
+
# 2) ํ
์คํธ ๊ธธ์ด ์ธก์ (์ด๋ฏธ์ง ํ ํฐ ์ ์ธํ ํ๋กฌํํธ)
|
| 250 |
+
base_text_prompt = f"Human: {prompt}\nAssistant:"
|
| 251 |
+
text_inputs = tokenizer(
|
| 252 |
+
base_text_prompt,
|
| 253 |
+
return_tensors="pt",
|
| 254 |
+
padding=False,
|
| 255 |
+
truncation=True,
|
| 256 |
+
max_length=2048,
|
| 257 |
+
)
|
| 258 |
+
text_len = int(text_inputs['input_ids'].shape[-1]) if 'input_ids' in text_inputs else 0
|
| 259 |
+
|
| 260 |
+
# 3) 이미지 토큰 예산 계산 (여유 마진 16)
|
| 261 |
+
total_budget = 2048
|
| 262 |
+
margin = 16
|
| 263 |
+
allowed_image_tokens = max(0, total_budget - text_len - margin)
|
| 264 |
+
print(f"๐ [DEBUG] ํ ํฐ ์์ฐ: text={text_len}, allowed_image={allowed_image_tokens}")
|
| 265 |
+
|
| 266 |
+
# 4) ์์ฐ ๋ด ์ต๋ ์ด๋ฏธ์ง ์ ์ ํ (์์์๋ถํฐ ๊ทธ๋ฆฌ๋)
|
| 267 |
+
selected_indices: List[int] = []
|
| 268 |
+
cum = 0
|
| 269 |
+
for i, tok in enumerate(per_image_tokens):
|
| 270 |
+
if cum + tok <= allowed_image_tokens:
|
| 271 |
+
selected_indices.append(i)
|
| 272 |
+
cum += tok
|
| 273 |
+
else:
|
| 274 |
+
break
|
| 275 |
+
|
| 276 |
+
# 5) 적용: 선택된 이미지만 유지
|
| 277 |
+
if selected_indices and len(selected_indices) < len(all_pixel_values):
|
| 278 |
+
print(f"๐ง [DEBUG] ์ด๋ฏธ์ง ์ ํ: {len(all_pixel_values)} -> {len(selected_indices)} (cum_tokens={cum})")
|
| 279 |
+
all_pixel_values = [all_pixel_values[i] for i in selected_indices]
|
| 280 |
+
if isinstance(combined_image_metas, dict):
|
| 281 |
+
for key in list(combined_image_metas.keys()):
|
| 282 |
+
try:
|
| 283 |
+
combined_image_metas[key] = [combined_image_metas[key][i] for i in selected_indices]
|
| 284 |
+
except Exception:
|
| 285 |
+
pass
|
| 286 |
+
# ์ต์ข
max_images ๊ฐฑ์
|
| 287 |
+
max_images = len(all_pixel_values)
|
| 288 |
+
elif not selected_indices:
|
| 289 |
+
print("โ ๏ธ [DEBUG] ์ด๋ฏธ์ง ์์ฐ ๋ถ์กฑ โ ํ
์คํธ-only๋ก ์ ํ")
|
| 290 |
+
all_pixel_values = []
|
| 291 |
+
combined_image_metas = {}
|
| 292 |
+
max_images = 0
|
| 293 |
+
except Exception as _e_budget:
|
| 294 |
+
print(f"โ ๏ธ [DEBUG] ์ด๋ฏธ์ง ์์ฐ ๊ณ์ฐ ์คํจ: {_e_budget}")
|
| 295 |
+
|
| 296 |
# --- 2. 프롬프트 구성 ---
|
| 297 |
print(f"๐ [DEBUG] ํ๋กฌํํธ ๊ตฌ์ฑ ์์")
|
| 298 |
|
| 299 |
+
# 컨텍스트 통합 (대화 기록 + 선택적 RAG 텍스트 검색) - 모델별 최적화
|
| 300 |
context_prompt = ""
|
| 301 |
if use_context and session_id:
|
| 302 |
try:
|
|
|
|
| 313 |
except Exception as e:
|
| 314 |
print(f"โ ๏ธ [DEBUG] ์ปจํ
์คํธ ๋ก๋ ์คํจ: {e}")
|
| 315 |
context_prompt = ""
|
| 316 |
+
|
| 317 |
+
# 선택적 RAG 텍스트 검색 통합
|
| 318 |
+
if use_rag_text:
|
| 319 |
+
try:
|
| 320 |
+
from lily_llm_core.vector_store_manager import vector_store_manager as _vsm
|
| 321 |
+
# ๋์ ๋ฌธ์ ์ ํ: ๋ช
์์ document_id > ์ต์ ๋ฌธ์
|
| 322 |
+
target_doc_id = document_id
|
| 323 |
+
if not target_doc_id:
|
| 324 |
+
user_docs = _vsm.get_user_documents(user_id)
|
| 325 |
+
if user_docs:
|
| 326 |
+
user_docs.sort(key=lambda d: d.get('last_updated') or d.get('created_at') or 0, reverse=True)
|
| 327 |
+
target_doc_id = user_docs[0].get('document_id')
|
| 328 |
+
# ๊ฒ์
|
| 329 |
+
if target_doc_id:
|
| 330 |
+
docs = _vsm.search_similar(user_id, target_doc_id, prompt, k=3)
|
| 331 |
+
rag_contexts = []
|
| 332 |
+
for d in docs:
|
| 333 |
+
try:
|
| 334 |
+
preview = getattr(d, 'page_content', '')
|
| 335 |
+
rag_contexts.append(preview)
|
| 336 |
+
except Exception:
|
| 337 |
+
continue
|
| 338 |
+
if rag_contexts:
|
| 339 |
+
rag_text = "\n\n".join(rag_contexts)
|
| 340 |
+
context_prompt = (context_prompt or "") + f"[RAG]\n{rag_text}\n\n"
|
| 341 |
+
print(f"๐ [DEBUG] RAG ํ
์คํธ ์ปจํ
์คํธ ํตํฉ: {len(rag_text)}์, ๋ฌธ์={target_doc_id}")
|
| 342 |
+
except Exception as _re:
|
| 343 |
+
print(f"โ ๏ธ [DEBUG] RAG ํ
์คํธ ๊ฒ์ ์คํจ: {_re}")
|
| 344 |
# 이미지 바이트를 세션 캐시에 보관 (다음 턴에 재사용)
|
| 345 |
if session_id:
|
| 346 |
# 원본 요청에 이미지가 있었다면 그걸 우선 보관, 없으면 복구된 이미지 유지
|
|
|
|
| 358 |
|
| 359 |
# formatted_prompt 초기화
|
| 360 |
formatted_prompt = None
|
| 361 |
+
# ๋ฉํฐ๋ชจ๋ฌ์ฉ RAG ์ค๋ํซ(๊ธธ์ด ์ ํ) ์ค๋น
|
| 362 |
+
rag_snippet_short = ""
|
| 363 |
+
if context_prompt and isinstance(context_prompt, str):
|
| 364 |
+
try:
|
| 365 |
+
# ๊ณผ๋ํ ๊ธธ์ด ๋ฐฉ์ง: ์ฐ์ 256์๋ก ์ ํ
|
| 366 |
+
rag_snippet_short = context_prompt[:256]
|
| 367 |
+
if not rag_snippet_short.endswith("\n"):
|
| 368 |
+
rag_snippet_short += "\n"
|
| 369 |
+
except Exception:
|
| 370 |
+
rag_snippet_short = ""
|
| 371 |
|
| 372 |
# ๐ ๋ฉํฐ๋ชจ๋ฌ ํ๋กฌํํธ ๊ตฌ์ฑ (๊ณต์ ๋ฐฉ์)
|
| 373 |
if all_pixel_values and len(all_pixel_values) > 0:
|
|
|
|
| 375 |
num_images = len(all_pixel_values)
|
| 376 |
image_tokens = "<image>" * num_images # ์ด๋ฏธ์ง ๊ฐ์๋งํผ <image> ํ ํฐ ์์ฑ
|
| 377 |
# 답변 유도를 위해 Assistant 프리픽스 추가
|
| 378 |
+
# ๊ธธ์ด ์ด๊ณผ๋ฅผ ๋ฐฉ์งํ๊ธฐ ์ํด ๋ฉํฐ๋ชจ๋ฌ ๊ฒฝ๋ก์์๋ ์ฌ์ฉ์ ์
๋ ฅ๋ง ํฌํจ
|
| 379 |
formatted_prompt = f"Human: {image_tokens}{prompt}\nAssistant:"
|
| 380 |
print(f"๐ [DEBUG] ๋ฉํฐ๋ชจ๋ฌ ํ๋กฌํํธ ๊ตฌ์ฑ (๊ณต์ ํ์): {formatted_prompt}")
|
| 381 |
print(f"๐ [DEBUG] ์ด๋ฏธ์ง ํ ํฐ ์์ฑ: {num_images}๊ฐ ์ด๋ฏธ์ง -> {image_tokens}")
|
|
|
|
| 543 |
# ๐ ์ต์ข
๊ฒ์ฆ
|
| 544 |
print(f"๐ [DEBUG] ์ต์ข
input_ids ํ์
: {type(input_ids)}, shape: {input_ids.shape}")
|
| 545 |
print(f"๐ [DEBUG] ์ต์ข
attention_mask ํ์
: {type(attention_mask)}, shape: {attention_mask.shape}")
|
| 546 |
+
|
| 547 |
+
# -1 토큰(이미지 자리표시자) 존재 검증 및 단계적 재시도
|
| 548 |
+
try:
|
| 549 |
+
neg_exists = (input_ids == -1).any().item() if hasattr(input_ids, 'any') else False
|
| 550 |
+
except Exception:
|
| 551 |
+
neg_exists = False
|
| 552 |
+
if not neg_exists and len(all_pixel_values) > 0:
|
| 553 |
+
print("โ ๏ธ [DEBUG] -1 ํ ํฐ ์์ โ RAG ์ค๋ํซ ๊ธธ์ด ์ค์ฌ ์ฌ์๋")
|
| 554 |
+
for limit in [128, 64, 0]:
|
| 555 |
+
try:
|
| 556 |
+
base_snippet = (context_prompt or "")[:limit]
|
| 557 |
+
if base_snippet and not base_snippet.endswith("\n"):
|
| 558 |
+
base_snippet += "\n"
|
| 559 |
+
base_prompt_retry = f"Human: {'<image>' * len(all_pixel_values)}{base_snippet}{prompt}\nAssistant:"
|
| 560 |
+
print(f"๐ [DEBUG] ์ฌ์๋ limit={limit}: {base_prompt_retry}")
|
| 561 |
+
inputs_retry = tokenizer.encode_prompt(
|
| 562 |
+
prompt=base_prompt_retry,
|
| 563 |
+
max_length=2048,
|
| 564 |
+
image_meta=final_meta
|
| 565 |
+
)
|
| 566 |
+
# 정규화
|
| 567 |
+
if 'seq_length' in inputs_retry:
|
| 568 |
+
del inputs_retry['seq_length']
|
| 569 |
+
_input_ids = inputs_retry['input_ids'][0] if isinstance(inputs_retry['input_ids'], tuple) else inputs_retry['input_ids']
|
| 570 |
+
_neg = (_input_ids == -1).any().item() if hasattr(_input_ids, 'any') else False
|
| 571 |
+
if _neg:
|
| 572 |
+
inputs = inputs_retry
|
| 573 |
+
input_ids = _input_ids
|
| 574 |
+
attention_mask = inputs_retry['attention_mask'][0] if isinstance(inputs_retry['attention_mask'], tuple) else inputs_retry['attention_mask']
|
| 575 |
+
formatted_prompt = base_prompt_retry
|
| 576 |
+
print("โ
[DEBUG] ์ฌ์๋ ์ฑ๊ณต: -1 ํ ํฐ ํ๋ณด")
|
| 577 |
+
break
|
| 578 |
+
except Exception as _re_try:
|
| 579 |
+
print(f"โ ๏ธ [DEBUG] ์ฌ์๋ ์คํจ(limit={limit}): {_re_try}")
|
| 580 |
except Exception as e:
|
| 581 |
+
print(f"โ [DEBUG] encode_prompt ์คํจ: {e}")
|
| 582 |
+
# 1์ฐจ ์ฌ์๋: ์ปจํ
์คํธ ์ ๊ฑฐํ๊ณ ์ด๋ฏธ์ง+์ง๋ฌธ๋ง์ผ๋ก ์ฌ์๋
|
| 583 |
+
try:
|
| 584 |
+
base_prompt_retry = f"Human: {'<image>' * len(all_pixel_values)}{prompt}\nAssistant:"
|
| 585 |
+
print(f"๐ [DEBUG] encode_prompt ์ฌ์๋(์ปจํ
์คํธ ์ ๊ฑฐ): {base_prompt_retry}")
|
| 586 |
+
inputs = tokenizer.encode_prompt(
|
| 587 |
+
prompt=base_prompt_retry,
|
| 588 |
+
max_length=2048,
|
| 589 |
+
image_meta=final_meta
|
| 590 |
+
)
|
| 591 |
+
print(f"๐ [DEBUG] encode_prompt ์ฌ์๋ ์ฑ๊ณต: {list(inputs.keys())}")
|
| 592 |
+
except Exception as e2:
|
| 593 |
+
print(f"โ [DEBUG] encode_prompt ์ฌ์๋ ์คํจ: {e2}. ํ
์คํธ-only๋ก ํด๋ฐฑ")
|
| 594 |
+
# ์ต์ข
ํด๋ฐฑ: ํ
์คํธ-only ๊ฒฝ๋ก๋ก ์ ํ(์ด๋ฏธ์ง ๋นํ์ฑํ)
|
| 595 |
+
all_pixel_values = []
|
| 596 |
+
image_processed = False
|
| 597 |
+
inputs = tokenizer(
|
| 598 |
+
formatted_prompt if formatted_prompt else prompt,
|
| 599 |
+
return_tensors="pt",
|
| 600 |
+
padding=True,
|
| 601 |
+
truncation=True,
|
| 602 |
+
max_length=2048,
|
| 603 |
+
)
|
| 604 |
+
if 'token_type_ids' in inputs:
|
| 605 |
+
del inputs['token_type_ids']
|
| 606 |
+
input_ids = inputs['input_ids']
|
| 607 |
+
attention_mask = inputs['attention_mask']
|
| 608 |
else:
|
| 609 |
# 안전 폴백
|
| 610 |
print(f"๐ [DEBUG] ๊ธฐ๋ณธ ํ ํฌ๋์ด์ ์ฌ์ฉ (ํด๋ฐฑ)")
|
|
|
|
| 777 |
else:
|
| 778 |
processed_image_metas[key] = value
|
| 779 |
|
| 780 |
+
# ๐ ์์ ๊ฐ๋: image_token_thw๊ฐ ๋น์ ์์ผ ๋ -1 ํ ํฐ์ด ์์ฑ๋์ง ์๋๋ก ๋ฐฉ์ง
|
| 781 |
+
try:
|
| 782 |
+
if 'image_token_thw' in processed_image_metas:
|
| 783 |
+
it = processed_image_metas['image_token_thw']
|
| 784 |
+
if isinstance(it, torch.Tensor) and (it.numel() == 0 or it.shape[-1] != 3):
|
| 785 |
+
print(f"โ ๏ธ [DEBUG] image_token_thw ๋น์ ์: {it.shape if hasattr(it,'shape') else type(it)} -> ์์ ๊ธฐ๋ณธ๊ฐ ์ ์ฉ")
|
| 786 |
+
processed_image_metas['image_token_thw'] = torch.tensor([[1,1,1]] * len(all_pixel_values), dtype=torch.long).unsqueeze(0)
|
| 787 |
+
except Exception as _safe_e:
|
| 788 |
+
print(f"โ ๏ธ [DEBUG] image_token_thw ์์ ํ ์คํจ: {_safe_e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
|
| 790 |
# ์์ ๊ฐ๋: vision_grid_thw๊ฐ [1, N, 3]๋ก ์ค๋ฉด [N, 3]๋ก ๋ณํ
|
| 791 |
try:
|