Spaces:
Sleeping
Sleeping
Auto commit at 23-2025-08 13:31:17
Browse files
lily_llm_api/app_v2_origin.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lily_llm_api/services/generation_service.py
CHANGED
|
@@ -358,24 +358,35 @@ def generate_sync(prompt: str, image_data_list: Optional[List[bytes]], max_lengt
|
|
| 358 |
if 'vision_grid_thw' in combined_image_metas:
|
| 359 |
vision_grid = combined_image_metas['vision_grid_thw']
|
| 360 |
if isinstance(vision_grid, list):
|
| 361 |
-
# 🔄 Kanana 모델 요구사항:
|
| 362 |
if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
|
| 363 |
-
# [(1, 34, 52)] -> (1, 34, 52) 텐서로 변환
|
| 364 |
t, h, w = vision_grid[0]
|
| 365 |
-
# π
|
| 366 |
-
processed_image_metas['vision_grid_thw'] = torch.tensor([[t, h, w]], dtype=torch.long)
|
| 367 |
print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
| 368 |
else:
|
| 369 |
-
# 🔄 다른 형태의 경우
|
| 370 |
-
processed_image_metas['vision_grid_thw'] = torch.tensor(vision_grid, dtype=torch.long)
|
| 371 |
print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
| 372 |
else:
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
-
# 🔄 다른
|
| 376 |
for key, value in combined_image_metas.items():
|
| 377 |
if key != 'vision_grid_thw':
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
generate_kwargs = {
|
| 381 |
'input_ids': input_ids,
|
|
|
|
| 358 |
if 'vision_grid_thw' in combined_image_metas:
|
| 359 |
vision_grid = combined_image_metas['vision_grid_thw']
|
| 360 |
if isinstance(vision_grid, list):
|
| 361 |
+
# 🔄 Kanana 모델 요구사항: 배치 차원을 맞춤
|
| 362 |
if len(vision_grid) == 1 and len(vision_grid[0]) == 3:
|
| 363 |
+
# [(1, 34, 52)] -> (1, 1, 34, 52) 텐서로 변환 (배치 차원 추가)
|
| 364 |
t, h, w = vision_grid[0]
|
| 365 |
+
# 🔄 4차원 텐서로 변환: (batch_size, T, H, W) 형태
|
| 366 |
+
processed_image_metas['vision_grid_thw'] = torch.tensor([[[t, h, w]]], dtype=torch.long)
|
| 367 |
print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환: {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
| 368 |
else:
|
| 369 |
+
# 🔄 다른 형태의 경우 배치 차원 추가
|
| 370 |
+
processed_image_metas['vision_grid_thw'] = torch.tensor([vision_grid], dtype=torch.long)
|
| 371 |
print(f"🔍 [DEBUG] vision_grid_thw 텐서 변환 (기본): {vision_grid} -> {processed_image_metas['vision_grid_thw'].shape}")
|
| 372 |
else:
|
| 373 |
+
# 텐서인 경우 배치 차원 확인 및 추가
|
| 374 |
+
if len(vision_grid.shape) == 3:
|
| 375 |
+
processed_image_metas['vision_grid_thw'] = vision_grid.unsqueeze(0)
|
| 376 |
+
else:
|
| 377 |
+
processed_image_metas['vision_grid_thw'] = vision_grid
|
| 378 |
|
| 379 |
+
# 🔄 다른 메타데이터도 배치 차원 맞춤
|
| 380 |
for key, value in combined_image_metas.items():
|
| 381 |
if key != 'vision_grid_thw':
|
| 382 |
+
if isinstance(value, list):
|
| 383 |
+
# 리스트인 경우 배치 차원 추가
|
| 384 |
+
processed_image_metas[key] = [value]
|
| 385 |
+
elif isinstance(value, torch.Tensor) and len(value.shape) == 2:
|
| 386 |
+
# 2차원 텐서인 경우 배치 차원 추가
|
| 387 |
+
processed_image_metas[key] = value.unsqueeze(0)
|
| 388 |
+
else:
|
| 389 |
+
processed_image_metas[key] = value
|
| 390 |
|
| 391 |
generate_kwargs = {
|
| 392 |
'input_ids': input_ids,
|