gbrabbit committed on
Commit
1d1372e
·
1 Parent(s): e938fb9

Auto commit at 24-2025-08 18:22:22

Browse files
lily_llm_api/api/routers/generation_router.py CHANGED
@@ -25,7 +25,11 @@ async def generate(request: Request,
25
  user_id: str = Form("anonymous"),
26
  room_id: str = Form("default"),
27
  use_context: bool = Form(True),
28
- session_id: str = Form(None)):
 
 
 
 
29
 
30
  if not is_model_loaded():
31
  raise HTTPException(status_code=503, detail="๋ชจ๋ธ์ด ๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
@@ -74,7 +78,7 @@ async def generate(request: Request,
74
 
75
  try:
76
  # generate_sync ํ•จ์ˆ˜ ํ˜ธ์ถœ (์ปจํ…์ŠคํŠธ ํฌํ•จ)
77
- result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id)
78
 
79
  if "error" in result:
80
  raise HTTPException(status_code=500, detail=result["error"])
 
25
  user_id: str = Form("anonymous"),
26
  room_id: str = Form("default"),
27
  use_context: bool = Form(True),
28
+ session_id: str = Form(None),
29
+ use_rag_images: bool = Form(False),
30
+ use_rag_text: bool = Form(False),
31
+ document_id: str = Form(None),
32
+ image_short_side: int = Form(None)):
33
 
34
  if not is_model_loaded():
35
  raise HTTPException(status_code=503, detail="๋ชจ๋ธ์ด ๋กœ๋“œ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.")
 
78
 
79
  try:
80
  # generate_sync ํ•จ์ˆ˜ ํ˜ธ์ถœ (์ปจํ…์ŠคํŠธ ํฌํ•จ)
81
+ result = generate_sync(prompt, image_data_list, use_context=use_context, session_id=session_id, user_id=user_id, room_id=room_id, use_rag_images=use_rag_images, use_rag_text=use_rag_text, document_id=document_id, image_short_side=image_short_side)
82
 
83
  if "error" in result:
84
  raise HTTPException(status_code=500, detail=result["error"])
lily_llm_api/services/generation_service.py CHANGED
@@ -2,6 +2,7 @@
2
  Generation service for Lily LLM API
3
  """
4
  import logging
 
5
  import time
6
  from typing import Optional, List, Dict
7
  from pathlib import Path
@@ -16,6 +17,11 @@ logger = logging.getLogger(__name__)
16
  # ์ฃผ์˜: ํ”„๋กœ์„ธ์Šค ์žฌ์‹œ์ž‘ ์‹œ ์ดˆ๊ธฐํ™”๋จ. ์ตœ๋Œ€ 4์žฅ ๋ณด๊ด€.
17
  _session_image_cache: Dict[str, List[bytes]] = {}
18
 
 
 
 
 
 
19
  # ์„ ํƒ์ : ๋ฒกํ„ฐ ์Šคํ† ์–ด์—์„œ ์ตœ๊ทผ ๋ฌธ์„œ ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ์ง€์›
20
  try:
21
  from lily_llm_core.vector_store_manager import vector_store_manager, SimpleVectorStore
@@ -26,7 +32,9 @@ except Exception:
26
  def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
27
  temperature: Optional[float] = None, top_p: Optional[float] = None,
28
  do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
29
- user_id: str = "anonymous", room_id: str = "default") -> dict:
 
 
30
  """[์ตœ์ ํ™”] ๋ชจ๋ธ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•˜๋Š” ํ†ตํ•ฉ ๋™๊ธฐ ํ•จ์ˆ˜"""
31
  try:
32
  from .model_service import get_current_profile, get_current_model
@@ -92,8 +100,8 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
92
  all_image_data.extend(cached_imgs)
93
  print(f"๐Ÿ” [DEBUG] ์„ธ์…˜ ์บ์‹œ์—์„œ ์ด์ „ ์ด๋ฏธ์ง€ {len(cached_imgs)}๊ฐœ ๋ณต๊ตฌ (์„ธ์…˜: {session_id})")
94
 
95
- # ์ถ”๊ฐ€ ๋ณต๊ตฌ: ์—ฌ์ „ํžˆ ์ด๋ฏธ์ง€๊ฐ€ ์—†๊ณ  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ์ด๋ฉด, ์ตœ๊ทผ RAG ๋ฌธ์„œ์—์„œ ์ด๋ฏธ์ง€ ๋ฐ”์ดํŠธ ๋ณต์›
96
- if (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
97
  try:
98
  if vector_store_manager is not None:
99
  # ์‚ฌ์šฉ์ž ๋ฌธ์„œ ๋ชฉ๋ก ๊ฐ€์ ธ์˜ค๊ธฐ (์ตœ์‹ ์ˆœ ์ •๋ ฌ)
@@ -129,9 +137,12 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
129
  print("โš ๏ธ [DEBUG] vector_store_manager ๋ฏธ์‚ฌ์šฉ - ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ๋น„ํ™œ์„ฑํ™”")
130
  except Exception as e:
131
  print(f"โš ๏ธ [DEBUG] RAG ๊ธฐ๋ฐ˜ ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ์‹คํŒจ: {e}")
 
 
132
 
133
  # ํ•ญ์ƒ ์ฐธ์กฐ ๊ฐ€๋Šฅํ•œ max_images ์ •์˜ (์ด๋ฏธ์ง€ ์—†์œผ๋ฉด 0)
134
- max_images = min(len([img for img in all_image_data if img]) if all_image_data else 0, 4)
 
135
 
136
  if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
137
  print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹œ์ž‘ - ์ด ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜: {len([img for img in all_image_data if img])}")
@@ -146,6 +157,22 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
146
  if image_bytes:
147
  try:
148
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # ๐Ÿ”„ ๊ณต์‹ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ ์‚ฌ์šฉ
150
  if processor and hasattr(processor, 'image_processor'):
151
  processed = processor.image_processor(pil_image)
@@ -169,6 +196,22 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
169
  if image_bytes:
170
  try:
171
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  # ๐Ÿ”„ ๊ณต์‹ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ ์‚ฌ์šฉ
173
  if processor and hasattr(processor, 'image_processor'):
174
  # KananaVImageProcessor๋Š” ๊ธฐ๋ณธ ํŒŒ๋ผ๋ฏธํ„ฐ๋งŒ ์ง€์›
@@ -189,10 +232,71 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
189
  logger.error(f"โŒ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
190
  combined_image_metas = {}
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  # --- 2. ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ ---
193
  print(f"๐Ÿ” [DEBUG] ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ ์‹œ์ž‘")
194
 
195
- # ์ปจํ…์ŠคํŠธ ํ†ตํ•ฉ (๋Œ€ํ™” ๊ธฐ๋ก + RAG ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ํฌํ•จ) - ๋ชจ๋ธ๋ณ„ ์ตœ์ ํ™”
196
  context_prompt = ""
197
  if use_context and session_id:
198
  try:
@@ -209,6 +313,34 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
209
  except Exception as e:
210
  print(f"โš ๏ธ [DEBUG] ์ปจํ…์ŠคํŠธ ๋กœ๋“œ ์‹คํŒจ: {e}")
211
  context_prompt = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  # ์ด๋ฏธ์ง€ ๋ฐ”์ดํŠธ๋ฅผ ์„ธ์…˜ ์บ์‹œ์— ๋ณด๊ด€ (๋‹ค์Œ ํ„ด์— ์žฌ์‚ฌ์šฉ)
213
  if session_id:
214
  # ์›๋ณธ ์š”์ฒญ์— ์ด๋ฏธ์ง€๊ฐ€ ์žˆ์—ˆ๋‹ค๋ฉด ๊ทธ๊ฑธ ์šฐ์„  ๋ณด๊ด€, ์—†์œผ๋ฉด ๋ณต๊ตฌ๋œ ์ด๋ฏธ์ง€ ์œ ์ง€
@@ -226,6 +358,16 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
226
 
227
  # formatted_prompt ์ดˆ๊ธฐํ™”
228
  formatted_prompt = None
 
 
 
 
 
 
 
 
 
 
229
 
230
  # ๐Ÿ”„ ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (๊ณต์‹ ๋ฐฉ์‹)
231
  if all_pixel_values and len(all_pixel_values) > 0:
@@ -233,6 +375,7 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
233
  num_images = len(all_pixel_values)
234
  image_tokens = "<image>" * num_images # ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜๋งŒํผ <image> ํ† ํฐ ์ƒ์„ฑ
235
  # ๋‹ต๋ณ€ ์œ ๋„๋ฅผ ์œ„ํ•ด Assistant ํ”„๋ฆฌํ”ฝ์Šค ์ถ”๊ฐ€
 
236
  formatted_prompt = f"Human: {image_tokens}{prompt}\nAssistant:"
237
  print(f"๐Ÿ” [DEBUG] ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (๊ณต์‹ ํ˜•์‹): {formatted_prompt}")
238
  print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ํ† ํฐ ์ƒ์„ฑ: {num_images}๊ฐœ ์ด๋ฏธ์ง€ -> {image_tokens}")
@@ -400,20 +543,68 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
400
  # ๐Ÿ”„ ์ตœ์ข… ๊ฒ€์ฆ
401
  print(f"๐Ÿ” [DEBUG] ์ตœ์ข… input_ids ํƒ€์ž…: {type(input_ids)}, shape: {input_ids.shape}")
402
  print(f"๐Ÿ” [DEBUG] ์ตœ์ข… attention_mask ํƒ€์ž…: {type(attention_mask)}, shape: {attention_mask.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  except Exception as e:
404
- print(f"โŒ [DEBUG] encode_prompt ์‹คํŒจ: {e}, ํด๋ฐฑ ์‚ฌ์šฉ")
405
- # ํด๋ฐฑ: ๊ธฐ๋ณธ ํ† ํฌ๋‚˜์ด์ € ์‚ฌ์šฉ
406
- inputs = tokenizer(
407
- formatted_prompt,
408
- return_tensors="pt",
409
- padding=True,
410
- truncation=True,
411
- max_length=2048,
412
- )
413
- if 'token_type_ids' in inputs:
414
- del inputs['token_type_ids']
415
- input_ids = inputs['input_ids']
416
- attention_mask = inputs['attention_mask']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  else:
418
  # ์•ˆ์ „ ํด๋ฐฑ
419
  print(f"๐Ÿ” [DEBUG] ๊ธฐ๋ณธ ํ† ํฌ๋‚˜์ด์ € ์‚ฌ์šฉ (ํด๋ฐฑ)")
@@ -586,25 +777,15 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
586
  else:
587
  processed_image_metas[key] = value
588
 
589
- # ๐Ÿ”„ ์ฐธ๊ณ  ๋กœ๊ทธ๋งŒ ์ถœ๋ ฅ: ์ด๋ฏธ์ง€ ํ† ํฐ ์ˆ˜ ์ถ”์ • (์กฐ์ •์€ ํ•˜์ง€ ์•Š์Œ)
590
- if 'image_token_thw' in processed_image_metas:
591
- image_token_thw = processed_image_metas['image_token_thw']
592
- if isinstance(image_token_thw, torch.Tensor):
593
- total_image_tokens = 0
594
- print(f"๐Ÿ” [DEBUG] image_token_thw shape: {image_token_thw.shape}")
595
- print(f"๐Ÿ” [DEBUG] image_token_thw ๋‚ด์šฉ: {image_token_thw}")
596
- for i in range(image_token_thw.shape[0]):
597
- token_info = image_token_thw[i]
598
- if len(token_info) == 3:
599
- t, h, w = token_info
600
- total_image_tokens += t * h * w
601
- elif len(token_info) == 2:
602
- h, w = token_info
603
- total_image_tokens += h * w
604
- print(f"๐Ÿ” [DEBUG] ๊ณ„์‚ฐ๋œ ์ด ์ด๋ฏธ์ง€ ํ† ํฐ ์ˆ˜(์ฐธ๊ณ ): {total_image_tokens}")
605
- if isinstance(total_image_tokens, torch.Tensor):
606
- total_image_tokens = total_image_tokens.sum().item()
607
- print(f"๐Ÿ” [DEBUG] pixel_values ๊ธธ์ด: {pixel_values.shape[0]}, ์˜ˆ์ƒ: {total_image_tokens} (์กฐ์ • ์•ˆํ•จ)")
608
 
609
  # ์•ˆ์ „ ๊ฐ€๋“œ: vision_grid_thw๊ฐ€ [1, N, 3]๋กœ ์˜ค๋ฉด [N, 3]๋กœ ๋ณ€ํ™˜
610
  try:
 
2
  Generation service for Lily LLM API
3
  """
4
  import logging
5
+ import os
6
  import time
7
  from typing import Optional, List, Dict
8
  from pathlib import Path
 
17
  # ์ฃผ์˜: ํ”„๋กœ์„ธ์Šค ์žฌ์‹œ์ž‘ ์‹œ ์ดˆ๊ธฐํ™”๋จ. ์ตœ๋Œ€ 4์žฅ ๋ณด๊ด€.
18
  _session_image_cache: Dict[str, List[bytes]] = {}
19
 
20
+ # ์ˆ˜๋™ ๋ฆฌ์‚ฌ์ด์ฆˆ ๊ฐ•์ œ ์„ค์ • (์ฝ”๋“œ์—์„œ ์ง์ ‘ ์ˆ˜์ •์šฉ). ์˜ˆ: 128, 256, 512 ... None์ด๋ฉด ๋น„ํ™œ์„ฑํ™”
21
+ DEFAULT_IMAGE_SHORT_SIDE: Optional[int] = 128
22
+
23
+ max_images_limit = 4
24
+
25
  # ์„ ํƒ์ : ๋ฒกํ„ฐ ์Šคํ† ์–ด์—์„œ ์ตœ๊ทผ ๋ฌธ์„œ ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ์ง€์›
26
  try:
27
  from lily_llm_core.vector_store_manager import vector_store_manager, SimpleVectorStore
 
32
  def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_length: Optional[int] = None,
33
  temperature: Optional[float] = None, top_p: Optional[float] = None,
34
  do_sample: Optional[bool] = None, use_context: bool = True, session_id: str = None,
35
+ user_id: str = "anonymous", room_id: str = "default", use_rag_images: bool = False,
36
+ use_rag_text: bool = False, document_id: Optional[str] = None,
37
+ image_short_side: Optional[int] = None) -> dict:
38
  """[์ตœ์ ํ™”] ๋ชจ๋ธ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•˜๋Š” ํ†ตํ•ฉ ๋™๊ธฐ ํ•จ์ˆ˜"""
39
  try:
40
  from .model_service import get_current_profile, get_current_model
 
100
  all_image_data.extend(cached_imgs)
101
  print(f"๐Ÿ” [DEBUG] ์„ธ์…˜ ์บ์‹œ์—์„œ ์ด์ „ ์ด๋ฏธ์ง€ {len(cached_imgs)}๊ฐœ ๋ณต๊ตฌ (์„ธ์…˜: {session_id})")
102
 
103
+ # ์ถ”๊ฐ€ ๋ณต๊ตฌ: ์—ฌ์ „ํžˆ ์ด๋ฏธ์ง€๊ฐ€ ์—†๊ณ  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ์ด๋ฉฐ, ๋ช…์‹œ์ ์œผ๋กœ ํ—ˆ์šฉ๋œ ๊ฒฝ์šฐ์—๋งŒ RAG์—์„œ ์ด๋ฏธ์ง€ ๋ณต์›
104
+ if use_rag_images and (not all_image_data or len([img for img in all_image_data if img]) == 0) and getattr(current_profile, 'multimodal', False):
105
  try:
106
  if vector_store_manager is not None:
107
  # ์‚ฌ์šฉ์ž ๋ฌธ์„œ ๋ชฉ๋ก ๊ฐ€์ ธ์˜ค๊ธฐ (์ตœ์‹ ์ˆœ ์ •๋ ฌ)
 
137
  print("โš ๏ธ [DEBUG] vector_store_manager ๋ฏธ์‚ฌ์šฉ - ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ๋น„ํ™œ์„ฑํ™”")
138
  except Exception as e:
139
  print(f"โš ๏ธ [DEBUG] RAG ๊ธฐ๋ฐ˜ ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ์‹คํŒจ: {e}")
140
+ elif not use_rag_images and getattr(current_profile, 'multimodal', False):
141
+ print("๐Ÿ” [DEBUG] RAG ์ด๋ฏธ์ง€ ๋ณต๊ตฌ ๋น„ํ™œ์„ฑํ™”๋จ(use_rag_images=False) - ํ…์ŠคํŠธ ์ „์šฉ ์œ ์ง€")
142
 
143
  # ํ•ญ์ƒ ์ฐธ์กฐ ๊ฐ€๋Šฅํ•œ max_images ์ •์˜ (์ด๋ฏธ์ง€ ์—†์œผ๋ฉด 0)
144
+ # 1์ฐจ ์ƒํ•œ์€ 4์žฅ์œผ๋กœ ์ œํ•œ (์ตœ์ข… ์„ ํƒ์€ ์˜ˆ์‚ฐ ๊ธฐ๋ฐ˜ ๋™์  ์„ ํƒ์—์„œ ๊ฒฐ์ •)
145
+ max_images = min(len([img for img in all_image_data if img]) if all_image_data else 0, max_images_limit)
146
 
147
  if all_image_data and len([img for img in all_image_data if img]) > 0 and getattr(current_profile, 'multimodal', False):
148
  print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ์‹œ์ž‘ - ์ด ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜: {len([img for img in all_image_data if img])}")
 
157
  if image_bytes:
158
  try:
159
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
160
+ # ์ˆ˜๋™ ๋ฆฌ์‚ฌ์ด์ฆˆ ์šฐ์„ ์ˆœ์œ„: API ํŒŒ๋ผ๋ฏธํ„ฐ > ์ฝ”๋“œ ์ƒ์ˆ˜ > ํ™˜๊ฒฝ๋ณ€์ˆ˜
161
+ env_short_side = os.getenv('LILY_IMAGE_SHORT_SIDE')
162
+ try:
163
+ env_short_side_val = int(env_short_side) if env_short_side is not None else None
164
+ except Exception:
165
+ env_short_side_val = None
166
+ effective_short_side = image_short_side or DEFAULT_IMAGE_SHORT_SIDE or env_short_side_val
167
+ if effective_short_side:
168
+ s = max(128, min(int(effective_short_side), 2048))
169
+ w, h = pil_image.size
170
+ if w > 0 and h > 0:
171
+ if w <= h:
172
+ new_w = s; new_h = int(h * (s / w))
173
+ else:
174
+ new_h = s; new_w = int(w * (s / h))
175
+ pil_image = pil_image.resize((max(1, new_w), max(1, new_h)))
176
  # ๐Ÿ”„ ๊ณต์‹ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ ์‚ฌ์šฉ
177
  if processor and hasattr(processor, 'image_processor'):
178
  processed = processor.image_processor(pil_image)
 
196
  if image_bytes:
197
  try:
198
  pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
199
+ # ์ˆ˜๋™ ๋ฆฌ์‚ฌ์ด์ฆˆ ์šฐ์„ ์ˆœ์œ„: API ํŒŒ๋ผ๋ฏธํ„ฐ > ์ฝ”๋“œ ์ƒ์ˆ˜ > ํ™˜๊ฒฝ๋ณ€์ˆ˜
200
+ env_short_side = os.getenv('LILY_IMAGE_SHORT_SIDE')
201
+ try:
202
+ env_short_side_val = int(env_short_side) if env_short_side is not None else None
203
+ except Exception:
204
+ env_short_side_val = None
205
+ effective_short_side = image_short_side or DEFAULT_IMAGE_SHORT_SIDE or env_short_side_val
206
+ if effective_short_side:
207
+ s = max(128, min(int(effective_short_side), 2048))
208
+ w, h = pil_image.size
209
+ if w > 0 and h > 0:
210
+ if w <= h:
211
+ new_w = s; new_h = int(h * (s / w))
212
+ else:
213
+ new_h = s; new_w = int(w * (s / h))
214
+ pil_image = pil_image.resize((max(1, new_w), max(1, new_h)))
215
  # ๐Ÿ”„ ๊ณต์‹ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ ์‚ฌ์šฉ
216
  if processor and hasattr(processor, 'image_processor'):
217
  # KananaVImageProcessor๋Š” ๊ธฐ๋ณธ ํŒŒ๋ผ๋ฏธํ„ฐ๋งŒ ์ง€์›
 
232
  logger.error(f"โŒ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ ์‹คํŒจ: {e}")
233
  combined_image_metas = {}
234
 
235
+ # ๐Ÿ”ง ์ด๋ฏธ์ง€ ํ† ํฐ ์˜ˆ์‚ฐ ๊ธฐ๋ฐ˜ ๋™์  ์„ ํƒ (๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ๊ธธ์ด ์ดˆ๊ณผ ๋ฐฉ์ง€)
236
+ try:
237
+ # 1) ์ด๋ฏธ์ง€๋ณ„ ํ† ํฐ ์ˆ˜ ์‚ฐ์ถœ
238
+ per_image_tokens: List[int] = []
239
+ if isinstance(combined_image_metas, dict) and 'image_token_thw' in combined_image_metas:
240
+ for thw in combined_image_metas['image_token_thw']:
241
+ if isinstance(thw, (list, tuple)) and len(thw) == 3:
242
+ per_image_tokens.append(int(thw[0]) * int(thw[1]) * int(thw[2]))
243
+ else:
244
+ per_image_tokens.append(0)
245
+ else:
246
+ # ๋ฉ”ํƒ€๊ฐ€ ์—†์œผ๋ฉด ๋ณด์ˆ˜์ ์œผ๋กœ ํฐ ๊ฐ’์œผ๋กœ ๊ฐ„์ฃผํ•˜์—ฌ ํ…์ŠคํŠธ-only๋กœ ์œ ๋„
247
+ per_image_tokens = [3000] * len(all_pixel_values)
248
+
249
+ # 2) ํ…์ŠคํŠธ ๊ธธ์ด ์ธก์ • (์ด๋ฏธ์ง€ ํ† ํฐ ์ œ์™ธํ•œ ํ”„๋กฌํ”„ํŠธ)
250
+ base_text_prompt = f"Human: {prompt}\nAssistant:"
251
+ text_inputs = tokenizer(
252
+ base_text_prompt,
253
+ return_tensors="pt",
254
+ padding=False,
255
+ truncation=True,
256
+ max_length=2048,
257
+ )
258
+ text_len = int(text_inputs['input_ids'].shape[-1]) if 'input_ids' in text_inputs else 0
259
+
260
+ # 3) ์ด๋ฏธ์ง€ ํ† ํฐ ์˜ˆ์‚ฐ ๊ณ„์‚ฐ (์—ฌ์œ  ๋งˆ์ง„ 16)
261
+ total_budget = 2048
262
+ margin = 16
263
+ allowed_image_tokens = max(0, total_budget - text_len - margin)
264
+ print(f"๐Ÿ” [DEBUG] ํ† ํฐ ์˜ˆ์‚ฐ: text={text_len}, allowed_image={allowed_image_tokens}")
265
+
266
+ # 4) ์˜ˆ์‚ฐ ๋‚ด ์ตœ๋Œ€ ์ด๋ฏธ์ง€ ์ˆ˜ ์„ ํƒ (์•ž์—์„œ๋ถ€ํ„ฐ ๊ทธ๋ฆฌ๋””)
267
+ selected_indices: List[int] = []
268
+ cum = 0
269
+ for i, tok in enumerate(per_image_tokens):
270
+ if cum + tok <= allowed_image_tokens:
271
+ selected_indices.append(i)
272
+ cum += tok
273
+ else:
274
+ break
275
+
276
+ # 5) ์ ์šฉ: ์„ ํƒ๋œ ์ด๋ฏธ์ง€๋งŒ ์œ ์ง€
277
+ if selected_indices and len(selected_indices) < len(all_pixel_values):
278
+ print(f"๐Ÿ”ง [DEBUG] ์ด๋ฏธ์ง€ ์„ ํƒ: {len(all_pixel_values)} -> {len(selected_indices)} (cum_tokens={cum})")
279
+ all_pixel_values = [all_pixel_values[i] for i in selected_indices]
280
+ if isinstance(combined_image_metas, dict):
281
+ for key in list(combined_image_metas.keys()):
282
+ try:
283
+ combined_image_metas[key] = [combined_image_metas[key][i] for i in selected_indices]
284
+ except Exception:
285
+ pass
286
+ # ์ตœ์ข… max_images ๊ฐฑ์‹ 
287
+ max_images = len(all_pixel_values)
288
+ elif not selected_indices:
289
+ print("โš ๏ธ [DEBUG] ์ด๋ฏธ์ง€ ์˜ˆ์‚ฐ ๋ถ€์กฑ โ†’ ํ…์ŠคํŠธ-only๋กœ ์ „ํ™˜")
290
+ all_pixel_values = []
291
+ combined_image_metas = {}
292
+ max_images = 0
293
+ except Exception as _e_budget:
294
+ print(f"โš ๏ธ [DEBUG] ์ด๋ฏธ์ง€ ์˜ˆ์‚ฐ ๊ณ„์‚ฐ ์‹คํŒจ: {_e_budget}")
295
+
296
  # --- 2. ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ ---
297
  print(f"๐Ÿ” [DEBUG] ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ ์‹œ์ž‘")
298
 
299
+ # ์ปจํ…์ŠคํŠธ ํ†ตํ•ฉ (๋Œ€ํ™” ๊ธฐ๋ก + ์„ ํƒ์  RAG ํ…์ŠคํŠธ ๊ฒ€์ƒ‰) - ๋ชจ๋ธ๋ณ„ ์ตœ์ ํ™”
300
  context_prompt = ""
301
  if use_context and session_id:
302
  try:
 
313
  except Exception as e:
314
  print(f"โš ๏ธ [DEBUG] ์ปจํ…์ŠคํŠธ ๋กœ๋“œ ์‹คํŒจ: {e}")
315
  context_prompt = ""
316
+
317
+ # ์„ ํƒ์  RAG ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ ํ†ตํ•ฉ
318
+ if use_rag_text:
319
+ try:
320
+ from lily_llm_core.vector_store_manager import vector_store_manager as _vsm
321
+ # ๋Œ€์ƒ ๋ฌธ์„œ ์„ ํƒ: ๋ช…์‹œ์  document_id > ์ตœ์‹  ๋ฌธ์„œ
322
+ target_doc_id = document_id
323
+ if not target_doc_id:
324
+ user_docs = _vsm.get_user_documents(user_id)
325
+ if user_docs:
326
+ user_docs.sort(key=lambda d: d.get('last_updated') or d.get('created_at') or 0, reverse=True)
327
+ target_doc_id = user_docs[0].get('document_id')
328
+ # ๊ฒ€์ƒ‰
329
+ if target_doc_id:
330
+ docs = _vsm.search_similar(user_id, target_doc_id, prompt, k=3)
331
+ rag_contexts = []
332
+ for d in docs:
333
+ try:
334
+ preview = getattr(d, 'page_content', '')
335
+ rag_contexts.append(preview)
336
+ except Exception:
337
+ continue
338
+ if rag_contexts:
339
+ rag_text = "\n\n".join(rag_contexts)
340
+ context_prompt = (context_prompt or "") + f"[RAG]\n{rag_text}\n\n"
341
+ print(f"๐Ÿ” [DEBUG] RAG ํ…์ŠคํŠธ ์ปจํ…์ŠคํŠธ ํ†ตํ•ฉ: {len(rag_text)}์ž, ๋ฌธ์„œ={target_doc_id}")
342
+ except Exception as _re:
343
+ print(f"โš ๏ธ [DEBUG] RAG ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ ์‹คํŒจ: {_re}")
344
  # ์ด๋ฏธ์ง€ ๋ฐ”์ดํŠธ๋ฅผ ์„ธ์…˜ ์บ์‹œ์— ๋ณด๊ด€ (๋‹ค์Œ ํ„ด์— ์žฌ์‚ฌ์šฉ)
345
  if session_id:
346
  # ์›๋ณธ ์š”์ฒญ์— ์ด๋ฏธ์ง€๊ฐ€ ์žˆ์—ˆ๋‹ค๋ฉด ๊ทธ๊ฑธ ์šฐ์„  ๋ณด๊ด€, ์—†์œผ๋ฉด ๋ณต๊ตฌ๋œ ์ด๋ฏธ์ง€ ์œ ์ง€
 
358
 
359
  # formatted_prompt ์ดˆ๊ธฐํ™”
360
  formatted_prompt = None
361
+ # ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ์šฉ RAG ์Šค๋‹ˆํŽซ(๊ธธ์ด ์ œํ•œ) ์ค€๋น„
362
+ rag_snippet_short = ""
363
+ if context_prompt and isinstance(context_prompt, str):
364
+ try:
365
+ # ๊ณผ๋„ํ•œ ๊ธธ์ด ๋ฐฉ์ง€: ์šฐ์„  256์ž๋กœ ์ œํ•œ
366
+ rag_snippet_short = context_prompt[:256]
367
+ if not rag_snippet_short.endswith("\n"):
368
+ rag_snippet_short += "\n"
369
+ except Exception:
370
+ rag_snippet_short = ""
371
 
372
  # ๐Ÿ”„ ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (๊ณต์‹ ๋ฐฉ์‹)
373
  if all_pixel_values and len(all_pixel_values) > 0:
 
375
  num_images = len(all_pixel_values)
376
  image_tokens = "<image>" * num_images # ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜๋งŒํผ <image> ํ† ํฐ ์ƒ์„ฑ
377
  # ๋‹ต๋ณ€ ์œ ๋„๋ฅผ ์œ„ํ•ด Assistant ํ”„๋ฆฌํ”ฝ์Šค ์ถ”๊ฐ€
378
+ # ๊ธธ์ด ์ดˆ๊ณผ๋ฅผ ๋ฐฉ์ง€ํ•˜๊ธฐ ์œ„ํ•ด ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ๊ฒฝ๋กœ์—์„œ๋Š” ์‚ฌ์šฉ์ž ์ž…๋ ฅ๋งŒ ํฌํ•จ
379
  formatted_prompt = f"Human: {image_tokens}{prompt}\nAssistant:"
380
  print(f"๐Ÿ” [DEBUG] ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ (๊ณต์‹ ํ˜•์‹): {formatted_prompt}")
381
  print(f"๐Ÿ” [DEBUG] ์ด๋ฏธ์ง€ ํ† ํฐ ์ƒ์„ฑ: {num_images}๊ฐœ ์ด๋ฏธ์ง€ -> {image_tokens}")
 
543
  # ๐Ÿ”„ ์ตœ์ข… ๊ฒ€์ฆ
544
  print(f"๐Ÿ” [DEBUG] ์ตœ์ข… input_ids ํƒ€์ž…: {type(input_ids)}, shape: {input_ids.shape}")
545
  print(f"๐Ÿ” [DEBUG] ์ตœ์ข… attention_mask ํƒ€์ž…: {type(attention_mask)}, shape: {attention_mask.shape}")
546
+
547
+ # -1 ํ† ํฐ(์ด๋ฏธ์ง€ ์ž๋ฆฌํ‘œ์‹œ์ž) ์กด์žฌ ๊ฒ€์ฆ ๋ฐ ๋‹จ๊ณ„์  ์žฌ์‹œ๋„
548
+ try:
549
+ neg_exists = (input_ids == -1).any().item() if hasattr(input_ids, 'any') else False
550
+ except Exception:
551
+ neg_exists = False
552
+ if not neg_exists and len(all_pixel_values) > 0:
553
+ print("โš ๏ธ [DEBUG] -1 ํ† ํฐ ์—†์Œ โ†’ RAG ์Šค๋‹ˆํŽซ ๊ธธ์ด ์ค„์—ฌ ์žฌ์‹œ๋„")
554
+ for limit in [128, 64, 0]:
555
+ try:
556
+ base_snippet = (context_prompt or "")[:limit]
557
+ if base_snippet and not base_snippet.endswith("\n"):
558
+ base_snippet += "\n"
559
+ base_prompt_retry = f"Human: {'<image>' * len(all_pixel_values)}{base_snippet}{prompt}\nAssistant:"
560
+ print(f"๐Ÿ” [DEBUG] ์žฌ์‹œ๋„ limit={limit}: {base_prompt_retry}")
561
+ inputs_retry = tokenizer.encode_prompt(
562
+ prompt=base_prompt_retry,
563
+ max_length=2048,
564
+ image_meta=final_meta
565
+ )
566
+ # ์ •๊ทœํ™”
567
+ if 'seq_length' in inputs_retry:
568
+ del inputs_retry['seq_length']
569
+ _input_ids = inputs_retry['input_ids'][0] if isinstance(inputs_retry['input_ids'], tuple) else inputs_retry['input_ids']
570
+ _neg = (_input_ids == -1).any().item() if hasattr(_input_ids, 'any') else False
571
+ if _neg:
572
+ inputs = inputs_retry
573
+ input_ids = _input_ids
574
+ attention_mask = inputs_retry['attention_mask'][0] if isinstance(inputs_retry['attention_mask'], tuple) else inputs_retry['attention_mask']
575
+ formatted_prompt = base_prompt_retry
576
+ print("โœ… [DEBUG] ์žฌ์‹œ๋„ ์„ฑ๊ณต: -1 ํ† ํฐ ํ™•๋ณด")
577
+ break
578
+ except Exception as _re_try:
579
+ print(f"โš ๏ธ [DEBUG] ์žฌ์‹œ๋„ ์‹คํŒจ(limit={limit}): {_re_try}")
580
  except Exception as e:
581
+ print(f"โŒ [DEBUG] encode_prompt ์‹คํŒจ: {e}")
582
+ # 1์ฐจ ์žฌ์‹œ๋„: ์ปจํ…์ŠคํŠธ ์ œ๊ฑฐํ•˜๊ณ  ์ด๋ฏธ์ง€+์งˆ๋ฌธ๋งŒ์œผ๋กœ ์žฌ์‹œ๋„
583
+ try:
584
+ base_prompt_retry = f"Human: {'<image>' * len(all_pixel_values)}{prompt}\nAssistant:"
585
+ print(f"๐Ÿ” [DEBUG] encode_prompt ์žฌ์‹œ๋„(์ปจํ…์ŠคํŠธ ์ œ๊ฑฐ): {base_prompt_retry}")
586
+ inputs = tokenizer.encode_prompt(
587
+ prompt=base_prompt_retry,
588
+ max_length=2048,
589
+ image_meta=final_meta
590
+ )
591
+ print(f"๐Ÿ” [DEBUG] encode_prompt ์žฌ์‹œ๋„ ์„ฑ๊ณต: {list(inputs.keys())}")
592
+ except Exception as e2:
593
+ print(f"โŒ [DEBUG] encode_prompt ์žฌ์‹œ๋„ ์‹คํŒจ: {e2}. ํ…์ŠคํŠธ-only๋กœ ํด๋ฐฑ")
594
+ # ์ตœ์ข… ํด๋ฐฑ: ํ…์ŠคํŠธ-only ๊ฒฝ๋กœ๋กœ ์ „ํ™˜(์ด๋ฏธ์ง€ ๋น„ํ™œ์„ฑํ™”)
595
+ all_pixel_values = []
596
+ image_processed = False
597
+ inputs = tokenizer(
598
+ formatted_prompt if formatted_prompt else prompt,
599
+ return_tensors="pt",
600
+ padding=True,
601
+ truncation=True,
602
+ max_length=2048,
603
+ )
604
+ if 'token_type_ids' in inputs:
605
+ del inputs['token_type_ids']
606
+ input_ids = inputs['input_ids']
607
+ attention_mask = inputs['attention_mask']
608
  else:
609
  # ์•ˆ์ „ ํด๋ฐฑ
610
  print(f"๐Ÿ” [DEBUG] ๊ธฐ๋ณธ ํ† ํฌ๋‚˜์ด์ € ์‚ฌ์šฉ (ํด๋ฐฑ)")
 
777
  else:
778
  processed_image_metas[key] = value
779
 
780
+ # ๐Ÿ”’ ์•ˆ์ „ ๊ฐ€๋“œ: image_token_thw๊ฐ€ ๋น„์ •์ƒ์ผ ๋•Œ -1 ํ† ํฐ์ด ์ƒ์„ฑ๋˜์ง€ ์•Š๋„๋ก ๋ฐฉ์ง€
781
+ try:
782
+ if 'image_token_thw' in processed_image_metas:
783
+ it = processed_image_metas['image_token_thw']
784
+ if isinstance(it, torch.Tensor) and (it.numel() == 0 or it.shape[-1] != 3):
785
+ print(f"โš ๏ธ [DEBUG] image_token_thw ๋น„์ •์ƒ: {it.shape if hasattr(it,'shape') else type(it)} -> ์•ˆ์ „ ๊ธฐ๋ณธ๊ฐ’ ์ ์šฉ")
786
+ processed_image_metas['image_token_thw'] = torch.tensor([[1,1,1]] * len(all_pixel_values), dtype=torch.long).unsqueeze(0)
787
+ except Exception as _safe_e:
788
+ print(f"โš ๏ธ [DEBUG] image_token_thw ์•ˆ์ „ํ™” ์‹คํŒจ: {_safe_e}")
 
 
 
 
 
 
 
 
 
 
789
 
790
  # ์•ˆ์ „ ๊ฐ€๋“œ: vision_grid_thw๊ฐ€ [1, N, 3]๋กœ ์˜ค๋ฉด [N, 3]๋กœ ๋ณ€ํ™˜
791
  try: