| """ | |
| Model service for Lily LLM API | |
| """ | |
| import logging | |
| import os | |
| import asyncio | |
| import concurrent.futures | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
# Global variables
current_model = None      # currently loaded model instance
current_profile = None    # currently selected model profile
model_loaded = False      # whether a model has finished loading
model = None
tokenizer = None
processor = None

executor = concurrent.futures.ThreadPoolExecutor()


def get_current_model():
    """Return the currently loaded model."""
    return current_model


def get_current_profile():
    """Return the currently selected model profile."""
    return current_profile


def is_model_loaded():
    """Return whether a model is loaded."""
    return model_loaded
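
# Usage sketch (illustrative, not from the original source): request handlers
# can gate on these accessors before serving, e.g.
#
#   if not is_model_loaded():
#       raise RuntimeError("model not loaded yet")
#   profile = get_current_profile()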


async def load_model_async(model_id: str):
    """Load a model asynchronously by handing the blocking load off to the executor."""
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(executor, load_model_sync, model_id)
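
# Startup-hook sketch (assumption: the API layer is an ASGI app such as FastAPI;
# the `app` object and the model id below are placeholders, not part of this module):
#
#   @app.on_event("startup")
#   async def _warm_up():
#       await load_model_async("example-model")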


def load_model_sync(model_id: str):
    """Load the model and its processor synchronously (final revision)."""
    global model, tokenizer, processor, current_profile, current_model, model_loaded
    try:
        if model is not None:
            logger.info("Unloading existing model...")
            # Drop all references so the old weights can be garbage-collected.
            model, tokenizer, processor = None, None, None
            import gc
            gc.collect()
            logger.info("Existing model unloaded")

        logger.info(f"Loading model '{model_id}'...")
        from ..models import get_model_profile
        current_profile = get_model_profile(model_id)

        # load_model now returns (model, processor).
        model, processor = current_profile.load_model()

        # Force the dtype once at server startup (avoids first-request latency).
        try:
            import torch as _torch
            # Pick the target dtype per device (defaults: CPU=float32, CUDA=bfloat16).
            if hasattr(model, 'device') and str(model.device) == 'cpu':
                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CPU_DTYPE') or 'float32').lower()
                default_target = _torch.float32
            else:
                desired = (os.getenv('LILY_FORCE_DTYPE') or os.getenv('LILY_CUDA_DTYPE') or 'bfloat16').lower()
                default_target = _torch.bfloat16
            desired_map = {
                'float32': _torch.float32,
                'fp32': _torch.float32,
                'bfloat16': _torch.bfloat16,
                'bf16': _torch.bfloat16,
                'float16': _torch.float16,
                'fp16': _torch.float16,
            }
            target_dtype = desired_map.get(desired, default_target)
            if hasattr(model, 'dtype') and model.dtype != target_dtype:
                logger.info(f"[SPEED][startup] casting dtype: {model.dtype} -> {target_dtype}")
                model = model.to(target_dtype)
        except Exception as _dtype_e:
            logger.warning(f"[startup] dtype cast failed: {_dtype_e}")

        # Publish the model to module globals (used by the LoRA path).
        current_model = model

        # Pull the tokenizer out of the processor and publish it as well.
        if hasattr(processor, 'tokenizer'):
            tokenizer = processor.tokenizer
        else:
            # Some processors double as the tokenizer themselves.
            tokenizer = processor

        logger.info(f"Model '{current_profile.display_name}' loaded!")

        # Auto-load the base model for LoRA (shared helper).
        try:
            from lily_llm_core.lora_manager import lora_manager
            if lora_manager:
                from ..utils.lora_utils import setup_lora_for_model
                setup_lora_for_model(current_profile, lora_manager)
        except ImportError:
            logger.warning("Failed to import the LoRA manager")

        model_loaded = True
    except Exception as e:
        logger.error(f"load_model_sync failed: {e}")
        import traceback
        logger.error(f"Full traceback: {traceback.format_exc()}")
        model_loaded = False
        raise


def shutdown_executor():
    """Shut down the thread pool executor."""
    executor.shutdown(wait=True)
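
# Minimal smoke test (sketch, not part of the original service). The model id
# is a placeholder, and load_model_sync uses relative imports, so run this as a
# module inside its package (python -m <package>.<module>) rather than directly.
if __name__ == "__main__":
    async def _demo():
        await load_model_async("example-model")  # placeholder model id
        print("loaded:", is_model_loaded())

    try:
        asyncio.run(_demo())
    finally:
        shutdown_executor()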